Using Sockets
The following is similar to a telnet session: we first connect to the server, then submit a GET request, and finally read the response.
import socket

# Open a TCP connection to the web server on port 80.
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))

# Request line URL scheme should be lowercase 'http://' — the original
# 'HTTP://' makes the server fail to match the document and return 404.
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)

# Read the response in 512-byte chunks until the server closes the
# connection (recv returns an empty bytes object).
while True:
    data = mysock.recv(512)
    if len(data) < 1:
        break
    # NOTE(review): decoding each chunk independently can split a
    # multi-byte UTF-8 character at a chunk boundary; fine for this
    # ASCII document, but accumulate bytes first for arbitrary text.
    print(data.decode())

mysock.close()
The encode method converts a Unicode string into UTF-8-encoded bytes; the decode method performs the reverse conversion.
Using urllib
import urllib.request, urllib.parse, urllib.error

# urlopen returns a file-like object that yields the document's raw bytes.
response = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for raw_line in response:
    # Each line arrives as bytes: decode to str and trim the trailing newline.
    print(raw_line.decode().strip())
Using Beautiful Soup
Beautiful Soup is a library used to parse HTML documents.
Link: Beautiful Soup
Example 1: Read span Tags
# To run this, install BeautifulSoup
# conda install -c anaconda beautifulsoup4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Build an SSL context that skips certificate validation so the example
# also works against sites with self-signed or invalid certificates.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# url = input('Enter - ')
url = 'http://some-site/some-document.html'
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags (the original comment incorrectly
# said "anchor tags" — the query below selects 'span').
tags = soup('span')
total = 0
for tag in tags:
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.string)
    print('Attrs:', tag.attrs)
    # Each span is expected to wrap an integer; accumulate the sum.
    # NOTE(review): int(tag.string) raises if a span holds non-numeric
    # text — acceptable for this exercise's known input.
    total += int(tag.string)
print('Count ', len(tags))
print('Sum ', total)
Example 2: Follow Links in a Page
# To run this, install BeautifulSoup
# conda install -c anaconda beautifulsoup4
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Returns the <a> tag in the argument position for the given url
def get_link_tag(url, position):
    """Fetch `url`, parse it, and return the anchor tag at `position`.

    `position` is 1-based (position 1 is the first <a> tag on the page).
    Raises IndexError if the page has fewer than `position` anchor tags.
    """
    print('Retrieving:', url)
    # Ignore SSL certificate errors
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Retrieve all of the anchor tags
    tags = soup('a')
    # (The original's `result = None` was a dead assignment, immediately
    # overwritten — removed.)
    return tags[position - 1]
# url = input('Enter URL: ')
url = 'http://some-site/some-page.html'
count = input('Enter count: ')
position = input('Enter position: ')

# Follow the link at `position` exactly `count` times.
# (The original `done = 0; while done <= int(count)` loop ran
# count + 1 times — a classic off-by-one.)
for _ in range(int(count)):
    tag = get_link_tag(url, int(position))
    # NOTE(review): if the tag has no href, url becomes None and the
    # next fetch fails — matches the original's behavior.
    url = tag.get('href', None)