Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Universal (and actually very cool) web crawler and parser, driven by a versatile .ini configuration file.
- megaparser.ini example:
- [settings]
- ;default=samsung
- ;default=nokia
- ;default=lg
- default=programmers
- [samsung]
- url=http://innovator.samsungmobile.com/prd/sym/product.list.do?platformId=3
- param=movePage.+?curPage=(\d+)
- template=samsung_page
- alias=samsung_page
- [samsung_page]
- url=http://innovator.samsungmobile.com/prd/sym/product.list.do?searchType=0&platformId=3&platform=&menuName=Symbian&sortingType=1&deviceType=&curPage=$1&listLines=10&cookieYN=Y&platformAll=
- param=onclick="goDetail\('([^`]+?)'
- template=samsung_model
- [samsung_model]
- url=http://innovator.samsungmobile.com/prd/sym/productDetl.view.do?modelCode=$1
- vendor=(Samsung)
- model=<h2 class="fl">([^<^\s]+)\s*</h2>
- width=al Resolution:[^\d]{1,5}?(\d{2,4})[^\d]{1,3}\d{2,4}
- height=al Resolution:[^\d]{1,5}?\d{2,4}[^\d]{1,3}(\d{2,4})
- midp=MIDP[- ]*?([.0-9]+)
- cldc=CLDC[- ]*?([.0-9]+)
- ;image=(http://innovator.samsungmobile.com/image.do\?serviceId=prd&modelCode=.+?&imgType=1)
- [jbenchmark]
- url=http://www.jbenchmark.com/index.jsp
- param=<option class="phone_list" >([^<]+)</option>
- template=jbenchmark_model
- [jbenchmark_model]
- url=http://www.jbenchmark.com/phonedetails.jsp?benchmark=jvm&D=$1&testgroup=null
- model=class="label2">(.+?) performance details
- width=Canvas Size.+?(\d+)x\d+
- height=Canvas Size.+?\d+x(\d+)
- cldc=ME Configuration.+?CLDC-([.0-9]+)
- midp=ME Profiles.+?MIDP-([.0-9]+)
- [nokia]
- url=http://www.developer.nokia.com/Devices/Device_specifications/?filter1=all
- param="/Devices/Device_specifications/(.*?)"
- template=nokia_model
- [nokia_model]
- url=http://www.developer.nokia.com/Devices/Device_specifications/$1/
- vendor=(Nokia)
- model=<title>.+?- Nokia ([^<]+)\s*?</title>
- width=property="dp:screen_width" content="(\d+)"
- height=property="dp:screen_height" content="(\d+)"
- midp=MIDP ([.0-9]+)
- cldc=\(CLDC\) ([.0-9]+)
- [motorola]
- url=http://developer.motorola.com/products/handsets-other/?num=all
- param=<a class="more" href="/products/handsets-other/([^"]+)">
- template=motorola_model
- [motorola_model]
- url=http://developer.motorola.com/products/handsets-other/$1
- vendor=(Motorola)
- [sonyericsson]
- url=http://developer.sonyericsson.com/wportal/devworld/phones/phone-gallery?cc=gb&lc=en
- param=href="/wportal/devworld/phones/phone-overview/(.+?)\?
- template=sonyericsson_model
- [sonyericsson_model]
- url=http://developer.sonyericsson.com/wportal/devworld/phones/phone-overview/$1?cc=gb&lc=en
- vendor=(SonyEricsson)
- model=<h2 id="productName">([^<]+)</h2>
- width=(\d+)[ x]+\d+ pixel
- height=\d+[ x]+(\d+) pixel
- midp=MIDP ([.0-9]+)
- cldc=CLDC ([.0-9]+)
- [lg]
- url=http://developer.lgmobile.com/lge.mdn.pho.RetrievePhoneList.dev?technicalTexts=Java&chkTechnical=Java&rowSize=12
- param=onclick="goPage\('([^']+)'\)"
- template=lg_row
- alias=lg_row
- [lg_row]
- url=http://developer.lgmobile.com/lge.mdn.pho.RetrievePhoneList.dev?technicalTexts=Java&chkTechnical=Java&rowSize=12&targetRow=$1
- param=onclick="retrieveSubmit\('([^']+)'\)"
- template=lg_model
- [lg_model]
- url=http://developer.lgmobile.com/lge.mdn.pho.RetrievePhoneInfo.dev?modelName=$1
- vendor=(LG)
- model=name="modelName" value="([^\"]+)"
- width=Full Screen.+?(\d+)[^\d]+\d+
- height=Full Screen.+?\d+[^\d]+(\d+)
- midp=MIDP ([.0-9]+)
- cldc=CLDC ([.0-9]+)
- [programmers]
- url=http://en.wikipedia.org/wiki/List_of_programmers
- param=<li><a href="/wiki/([^"]+)" title=
- template=programmers_page
- [programmers_page]
- url=http://en.wikipedia.org/wiki/$1
- name=<title>(.+?) - Wikipedia, the free encyclopedia</title>
- born=<span class="bday">(.+?)</span>
- died=<span class="dday">(.+?)</span>
- """
- from ConfigParser import ConfigParser
- import urllib2
- import csv
- import sys
- import re
# Shared crawler state, mutated by the functions below.
db, db_fields = [], []  # scraped result rows / field names of the section being scraped
counter = total = 0  # pages fetched so far / keys (pages) discovered so far
def db_format(var, s):
    """Return the value ``s`` of field ``var`` formatted for the database.

    This is currently a pass-through: ``var`` is ignored and ``s`` is handed
    back unchanged (the ASCII normalisation step is disabled).
    """
    formatted = s
    return formatted
def dump_db():
    """Write every row collected in ``db`` to stdout as CSV.

    Columns follow the order of the module-level ``db_fields`` list.
    No header line is written.
    """
    writer = csv.DictWriter(sys.stdout, fieldnames=db_fields)
    emit = writer.writerow
    for record in db:
        emit(record)
def asciify(s):
    """Reduce ``s`` to printable ASCII text.

    ``s`` may be a UTF-8 encoded byte string or already-decoded text.  Only
    byte strings are decoded; calling ``.decode`` on text that is already
    unicode either fails on non-ASCII characters (Python 2) or does not exist
    at all (Python 3).

    The en dash (U+2013) becomes a plain hyphen, every other character outside
    the 0x20-0x7F range becomes a space, runs of spaces are collapsed to one,
    and leading/trailing whitespace is stripped.

    Returns the cleaned-up text.
    """
    if isinstance(s, bytes):
        s = s.decode("utf-8")
    s = s.replace(u"\u2013", "-")
    # Anything that is not in the 0x20-0x7F range (control characters,
    # newlines, non-ASCII letters) is blanked out.
    s = re.sub(r"[^\x20-\x7F]", " ", s)
    s = re.sub(" +", " ", s)
    return s.strip()
def tagify(s):
    """Turn ``s`` into a token that is safe to use as a file name.

    The text is first normalised with ``asciify``; afterwards every character
    that is not an ASCII letter, a digit or a hyphen is replaced by ``_``.
    """
    return re.sub("[^a-zA-Z0-9-]", "_", asciify(s))
def get_page(section, url, key):
    """Return the content of ``url``, using the on-disk cache when possible.

    The cache file is ``download/<section>/<tagify(key)>.html``.  If it cannot
    be read, a progress message is written to stderr and the page is fetched
    (and cached) through ``wget``.  The global ``counter`` is incremented for
    every page returned.

    NOTE(review): the original paste lost its indentation; ``counter`` is
    assumed to be incremented for cached pages as well as downloaded ones --
    confirm.
    """
    global counter
    fname = "download/%s/%s.html" % (section, tagify(key))
    try:
        # Read in binary mode, matching how wget() writes the cache file, and
        # close the handle deterministically.
        with open(fname, "rb") as fp:
            text = fp.read()
    except (IOError, OSError):
        # Only a missing/unreadable cache file triggers a download; other
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        sys.stderr.write("%s\nDownloading page %d of %d...\n" % (url, counter, total))
        text = wget(url, fname)
    counter += 1
    return text
def wget(url, fname):
    """Download ``url``, store the raw response body in ``fname`` and return it.

    The request is sent with the ``User-Agent`` header set to ``megaparser``.
    The directory part of ``fname`` is created when it does not exist yet.
    Errors from ``urllib2`` and from the file system propagate to the caller.
    """
    import os
    req = urllib2.Request(url, None, {'User-Agent': 'megaparser'})
    response = urllib2.urlopen(req)
    try:
        text = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    d = os.path.dirname(fname)
    # dirname() is "" for a bare file name, and os.makedirs("") would raise.
    if d and not os.path.exists(d):
        os.makedirs(d)
    with open(fname, "wb") as fp:
        fp.write(text)
    return text
def get_var(conf, sec, var):
    """Return option ``var`` of section ``sec`` from ``conf``, or ``""``.

    Any ordinary lookup failure (for example a missing section or option) is
    treated as "not configured" and yields an empty string.  Only
    ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate.
    """
    try:
        return conf.get(sec, var)
    except Exception:
        return ""
def get_keys(reg, text):
    """Return the distinct values of capture group 1 for ``reg`` in ``text``.

    The pattern is applied with ``re.MULTILINE | re.DOTALL``.  Keys are
    returned in order of first appearance.  Matches of a pattern without any
    capture group are ignored, and so are matches in which group 1 did not
    participate (its value would be ``None``, which cannot be substituted into
    a URL later on).
    """
    keys = []
    seen = set()  # O(1) duplicate check; ``keys`` preserves the order
    for m in re.finditer(reg, text, re.MULTILINE | re.DOTALL):
        if not m.groups():
            continue
        k = m.group(1)
        if k is None or k in seen:
            continue
        seen.add(k)
        keys.append(k)
    return keys
def get_values(conf, sec, text, url):
    """Extract one record from ``text`` and append it to the global ``db``.

    For every field name in ``db_fields`` the regular expression configured
    under that name in section ``sec`` is applied to ``text`` (with
    ``re.MULTILINE | re.DOTALL``).  The value of capture group 1 is stored
    with newlines replaced by a literal ``\\n``; when a pattern matches
    several times the last match wins.

    Nothing is appended when no field matched.  Otherwise the fields that did
    not match are filled with ``""``.

    NOTE(review): the original paste lost its indentation; the per-match
    assignment is assumed to live inside the match loop -- confirm.
    """
    res = {}
    for var in db_fields:
        reg = get_var(conf, sec, var)
        if not reg:
            continue
        for m in re.finditer(reg, text, re.MULTILINE | re.DOTALL):
            if not m.groups():
                continue
            val = m.group(1)
            if val is None:
                # Optional group that did not participate in this match.
                continue
            res[var] = val.replace('\n', '\\n')
    if not res:
        return
    for var in db_fields:
        res.setdefault(var, "")
    # get_fields() filters out "url", so this only applies when db_fields was
    # populated some other way.
    if "url" in db_fields:
        res["url"] = url
    db.append(res)
def get_fields(conf, sec):
    """List the data-field option names of section ``sec``.

    All options of the section are considered, in the order ``conf.items``
    yields them, except the crawler's control options ``url``, ``param``,
    ``template`` and ``alias``.
    """
    reserved = ["url", "param", "template", "alias"]
    return [option[0] for option in conf.items(sec) if option[0] not in reserved]
def process(conf, sec, text, parent, url):
    """Scrape ``text`` according to section ``sec`` and follow its links.

    First the section's field patterns are applied through ``get_values``.
    If the section has a ``param`` pattern, the keys it finds are added to
    the global ``total``; if it also names a ``template`` section,
    ``db_fields`` is switched to that section's fields and every key is
    handed to ``parse`` with the template as its section.

    NOTE(review): nesting reconstructed from a paste that lost its
    indentation -- confirm.
    """
    global db_fields, total
    get_values(conf, sec, text, url)

    pattern = get_var(conf, sec, "param")
    if not pattern:
        return
    found = get_keys(pattern, text)
    total += len(found)

    template = get_var(conf, sec, "template")
    if not template:
        return
    db_fields = get_fields(conf, template)
    for item in found:
        parse(conf, template, item, parent)
def parse(conf, sec, key="index", parent=""):
    """Fetch the page of section ``sec`` for ``key`` and process it.

    The section's ``url`` option has ``$1`` replaced by ``key``.  ``parent``
    selects the cache sub-directory and defaults to ``sec`` itself.  When the
    section declares an ``alias``, the page is first processed with the alias
    section's rules, then ``db_fields`` is reset to the alias section's fields
    and the values of ``sec`` are collected.  In every case the page is
    finally processed with the rules of ``sec``.

    NOTE(review): nesting reconstructed from a paste that lost its
    indentation -- confirm.
    """
    global db_fields
    root = parent if parent != "" else sec
    address = get_var(conf, sec, "url").replace("$1", key)
    page = get_page(root, address, key)

    alias = get_var(conf, sec, "alias")
    if alias:
        process(conf, alias, page, root, address)
        db_fields = get_fields(conf, alias)
        get_values(conf, sec, page, address)
    process(conf, sec, page, root, address)
if __name__ == '__main__':
    # Script entry point: load megaparser.ini, crawl the section named by
    # [settings] default, then print the collected rows as CSV.
    conf = ConfigParser()
    conf.read("megaparser.ini")
    parse(conf, get_var(conf, "settings", "default"))
    dump_db()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement