Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # <markdown>
- # This is a lab about interacting with web pages and XML (I think)
- # <codecell>
- import urllib2
- import bs4
- import json
- import datetime as dt
- import pandas as pd
- import numpy as np
- import unicodedata
- # <markdown>
- # urllib2 is a useful module to get information from the web
- # (unless it is javascript protected)
- # the function urlopen() opens a url
- # to read the entire html to a single string, use read()
- # to read line by line, use readline()
- # read() reads the html code and close() closes the connection
- # keep reading this to get used to this library.
- # https://docs.python.org/2/library/urllib2.html
- # <codecell>
- x = urllib2.urlopen("http://www.google.com")
- htmlSource = x.read()
- x.close()
- type(htmlSource)
- print htmlSource[:800]
- # <markdown>
- # reading the html source is ok, but now you have to parse it (both html and xml)
- # for which we will use beautifulsoup
- # let's try some, but with a different url, as google.com is silly
- # <codecell>
- x = urllib2.urlopen("http://www.reddit.com")
- htmlSource = x.read()
- x.close()
- print htmlSource[:500]
- # <codecell>
- soup = bs4.BeautifulSoup(htmlSource)
- print type(soup)
- print soup.prettify()[:100]
- print soup.head.prettify()[:600]
Advertisement
Add Comment
Please, Sign In to add comment