Using Sockets

The following is similar to a telnet session: we first connect to the server, then submit a GET request and read the response.

import socket

# Open a TCP connection to the HTTP server on port 80.
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))

# Send a hand-built HTTP/1.0 GET request.
# NOTE: the URL scheme is conventionally lowercase ('http://', not 'HTTP://').
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)

# Read the response in 512-byte chunks until the server closes the connection.
while True:
    data = mysock.recv(512)
    if len(data) < 1:
        break
    # end='' so that chunk boundaries do not inject extra newlines
    # into the middle of the document.
    print(data.decode(), end='')

mysock.close()

The encode method converts a Unicode string to a sequence of UTF-8 bytes.  The decode method performs the reverse conversion (UTF-8 bytes back to a Unicode string).

Using urllib

import urllib.request, urllib.parse, urllib.error

# urlopen returns a file-like object; iterating it yields one raw
# (bytes) line at a time, which we decode and strip before printing.
response = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for raw_line in response:
    text = raw_line.decode().strip()
    print(text)

Using Beautiful Soup

Beautiful Soup is a library used to parse HTML documents.

Link: Beautiful Soup

Example 1: Read span Tags

# To run this, install BeautifulSoup 
# conda install -c anaconda beautifulsoup4


from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Build an SSL context that skips certificate verification, so the
# example also works against hosts with bad or self-signed certificates.
ssl_ctx = ssl.create_default_context()
ssl_ctx.check_hostname = False
ssl_ctx.verify_mode = ssl.CERT_NONE

#url = input('Enter - ')
url = 'http://some-site/some-document.html'
page = urlopen(url, context=ssl_ctx).read()
soup = BeautifulSoup(page, "html.parser")

# Collect every <span> tag; calling soup('span') is shorthand for
# soup.find_all('span').
tags = soup('span')
total = 0
for tag in tags:
    # Dump the interesting parts of each tag.
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.string)
    print('Attrs:', tag.attrs)
    # Assumes each span's text content is an integer.
    total += int(tag.string)

print('Count ', len(tags))
print('Sum ', total)

Example 2: Follow Links in a Page

# To run this, install BeautifulSoup 
# conda install -c anaconda beautifulsoup4

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Returns the a tag in the argument position for the given url
def get_link_tag(url, position):
    """Fetch *url*, parse it, and return the anchor (<a>) tag found at
    the 1-based *position* among all anchors on the page.

    Raises IndexError if the page has fewer than *position* anchors.
    """
    print('Retrieving:', url)
    # Ignore SSL certificate errors so https pages with bad certs load.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # soup('a') is shorthand for soup.find_all('a').
    tags = soup('a')
    # position is 1-based; list indexing is 0-based.
    return tags[position - 1]


# url = input('Enter URL: ')
url = 'http://some-site/some-page.html'
# Convert once at the boundary instead of re-converting every iteration.
count = int(input('Enter count: '))        # number of links to follow
position = int(input('Enter position: '))  # 1-based link to take each hop

# Follow the link at `position` exactly `count` times.
# (The original `while done <= count` loop followed one extra link.)
for _ in range(count):
    tag = get_link_tag(url, position)
    url = tag.get('href', None)