# does_not_work_lighttable
# NOTE(review): the line above was a bare identifier, which Python evaluates as
# an expression and fails on with NameError before any cell runs. It is
# presumably a note that this file does not run under Light Table -- confirm.

# <markdown>
# This is a lab about interacting with web pages and XML (I think)

# <codecell>

import urllib2
import bs4
import json
import datetime as dt
import pandas as pd
import numpy as np
import unicodedata

# <markdown>
# urllib2 is a useful module for getting information from the web
# (unless the page is javascript protected)
# the function urlopen() opens a url
# to read the entire html into a single string, use read()
# to read line by line, use readline()
# read() reads the html code and close() closes the connection

# keep reading this to get used to this library.
# https://docs.python.org/2/library/urllib2.html

# <codecell>
x = urllib2.urlopen("http://www.google.com")
htmlSource = x.read()
x.close()
type(htmlSource)
print htmlSource[:800]

# <markdown>
# reading the html source is ok, but now you have to parse it (both html and xml)
# for which we will use beautifulsoup
# let's try some, but with a different url, as google.com is silly

# <codecell>
x = urllib2.urlopen("http://www.reddit.com")
htmlSource = x.read()
x.close()
print htmlSource[:500]


# <codecell>
soup = bs4.BeautifulSoup(htmlSource)
print type(soup)
print soup.prettify()[:100]
print soup.head.prettify()[:600]