Using Sockets
The following is similar to a telnet session: we first connect to the server, then submit a GET request, and finally read the response.
# Open a raw TCP connection to the web server, send a minimal HTTP GET
# request by hand, and print the response as it arrives.
import socket

mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))

# Bug fix: the request URL must use a lowercase 'http://' scheme — the
# original 'HTTP://' makes this server respond with 404 Not Found.
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)

# Read the response in 512-byte chunks until the server closes the
# connection (recv returns an empty bytes object).
while True:
    data = mysock.recv(512)
    if len(data) < 1:
        break
    print(data.decode())

mysock.close()
The encode method converts a Unicode string into a sequence of UTF-8-encoded bytes; the decode method performs the reverse conversion, turning UTF-8 bytes back into a Unicode string.
Using urllib
# Fetch the same document with urllib, which handles the socket and the
# HTTP protocol details for us.
import urllib.request, urllib.parse, urllib.error

# urlopen returns a file-like handle that yields the body as raw bytes,
# one line at a time; decode each line before printing.
response = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for raw_line in response:
    text = raw_line.decode().strip()
    print(text)
Using Beautiful Soup
Beautiful Soup is a library used to parse HTML documents.
Link: Beautiful Soup
Example 1: Read span Tags
# Read every <span> tag from a page and sum their integer contents.
#
# To run this, install BeautifulSoup:
#   conda install -c anaconda beautifulsoup4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors so pages behind self-signed
# certificates can still be fetched.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# url = input('Enter - ')
url = 'http://some-site/some-document.html'
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags.
# (Bug fix: the original comment said "anchor tags", but the code
# selects 'span' elements.)
tags = soup('span')
total = 0
for tag in tags:
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.string)
    print('Attrs:', tag.attrs)
    # Assumes each span's text is an integer (as in the course
    # exercise) — raises ValueError/TypeError otherwise.
    total += int(tag.string)

print('Count ', len(tags))
print('Sum ', total)
Example 2: Follow Links in a Page
# Repeatedly follow the link found at a given position on each page.
#
# To run this, install BeautifulSoup:
#   conda install -c anaconda beautifulsoup4
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl


def get_link_tag(url, position):
    """Return the anchor (<a>) tag at the given 1-based position on the page at url."""
    print('Retrieving:', url)

    # Ignore SSL certificate errors so pages behind self-signed
    # certificates can still be fetched.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all of the anchor tags; position is 1-based, so index
    # with position - 1.  (Removed a dead `result = None` assignment
    # that was immediately overwritten in the original.)
    tags = soup('a')
    return tags[position - 1]


# url = input('Enter URL: ')
url = 'http://some-site/some-page.html'
# Parse the numeric inputs once, instead of on every loop iteration.
count = int(input('Enter count: '))
position = int(input('Enter position: '))

# Bug fix: the original loop used `done = 0; while done <= count`,
# which followed the link count + 1 times; repeat exactly count times.
for _ in range(count):
    tag = get_link_tag(url, position)
    url = tag.get('href', None)