Advertisement
joric

megaparser.py

Feb 2nd, 2017
379
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.30 KB | None | 0 0
  1. """
  2. Universal (and actually very cool) web crawler and parser, driven by a versatile .ini configuration file.
  3.  
  4. megaparser.ini example:
  5.  
  6. [settings]
  7. ;default=samsung
  8. ;default=nokia
  9. ;default=lg
  10. default=programmers
  11.  
  12. [samsung]
  13. url=http://innovator.samsungmobile.com/prd/sym/product.list.do?platformId=3
  14. param=movePage.+?curPage=(\d+)
  15. template=samsung_page
  16. alias=samsung_page
  17.  
  18. [samsung_page]
  19. url=http://innovator.samsungmobile.com/prd/sym/product.list.do?searchType=0&platformId=3&platform=&menuName=Symbian&sortingType=1&deviceType=&curPage=$1&listLines=10&cookieYN=Y&platformAll=
  20. param=onclick="goDetail\('([^`]+?)'
  21. template=samsung_model
  22.  
  23. [samsung_model]
  24. url=http://innovator.samsungmobile.com/prd/sym/productDetl.view.do?modelCode=$1
  25. vendor=(Samsung)
  26. model=<h2 class="fl">([^<^\s]+)\s*</h2>
  27. width=al Resolution:[^\d]{1,5}?(\d{2,4})[^\d]{1,3}\d{2,4}
  28. height=al Resolution:[^\d]{1,5}?\d{2,4}[^\d]{1,3}(\d{2,4})
  29. midp=MIDP[- ]*?([.0-9]+)
  30. cldc=CLDC[- ]*?([.0-9]+)
  31. ;image=(http://innovator.samsungmobile.com/image.do\?serviceId=prd&modelCode=.+?&imgType=1)
  32.  
  33. [jbenchmark]
  34. url=http://www.jbenchmark.com/index.jsp
  35. param=<option class="phone_list" >([^<]+)</option>
  36. template=jbenchmark_model
  37.  
  38. [jbenchmark_model]
  39. url=http://www.jbenchmark.com/phonedetails.jsp?benchmark=jvm&D=$1&testgroup=null
  40. model=class="label2">(.+?) performance details
  41. width=Canvas Size.+?(\d+)x\d+
  42. height=Canvas Size.+?\d+x(\d+)
  43. cldc=ME Configuration.+?CLDC-([.0-9]+)
  44. midp=ME Profiles.+?MIDP-([.0-9]+)
  45.  
  46. [nokia]
  47. url=http://www.developer.nokia.com/Devices/Device_specifications/?filter1=all
  48. param="/Devices/Device_specifications/(.*?)"
  49. template=nokia_model
  50.  
  51. [nokia_model]
  52. url=http://www.developer.nokia.com/Devices/Device_specifications/$1/
  53. vendor=(Nokia)
  54. model=<title>.+?- Nokia ([^<]+)\s*?</title>
  55. width=property="dp:screen_width" content="(\d+)"
  56. height=property="dp:screen_height" content="(\d+)"
  57. midp=MIDP ([.0-9]+)
  58. cldc=\(CLDC\) ([.0-9]+)
  59.  
  60. [motorola]
  61. url=http://developer.motorola.com/products/handsets-other/?num=all
  62. param=<a class="more" href="/products/handsets-other/([^"]+)">
  63. template=motorola_model
  64.  
  65. [motorola_model]
  66. url=http://developer.motorola.com/products/handsets-other/$1
  67. vendor=(Motorola)
  68.  
  69. [sonyericsson]
  70. url=http://developer.sonyericsson.com/wportal/devworld/phones/phone-gallery?cc=gb&lc=en
  71. param=href="/wportal/devworld/phones/phone-overview/(.+?)\?
  72. template=sonyericsson_model
  73.  
  74. [sonyericsson_model]
  75. url=http://developer.sonyericsson.com/wportal/devworld/phones/phone-overview/$1?cc=gb&lc=en
  76. vendor=(SonyEricsson)
  77. model=<h2 id="productName">([^<]+)</h2>
  78. width=(\d+)[ x]+\d+ pixel
  79. height=\d+[ x]+(\d+) pixel
  80. midp=MIDP ([.0-9]+)
  81. cldc=CLDC ([.0-9]+)
  82.  
  83.  
  84. [lg]
  85. url=http://developer.lgmobile.com/lge.mdn.pho.RetrievePhoneList.dev?technicalTexts=Java&chkTechnical=Java&rowSize=12
  86. param=onclick="goPage\('([^']+)'\)"
  87. template=lg_row
  88. alias=lg_row
  89.  
  90. [lg_row]
  91. url=http://developer.lgmobile.com/lge.mdn.pho.RetrievePhoneList.dev?technicalTexts=Java&chkTechnical=Java&rowSize=12&targetRow=$1
  92. param=onclick="retrieveSubmit\('([^']+)'\)"
  93. template=lg_model
  94.  
  95. [lg_model]
  96. url=http://developer.lgmobile.com/lge.mdn.pho.RetrievePhoneInfo.dev?modelName=$1
  97. vendor=(LG)
  98. model=name="modelName" value="([^\"]+)"
  99. width=Full Screen.+?(\d+)[^\d]+\d+
  100. height=Full Screen.+?\d+[^\d]+(\d+)
  101. midp=MIDP ([.0-9]+)
  102. cldc=CLDC ([.0-9]+)
  103.  
  104.  
  105. [programmers]
  106. url=http://en.wikipedia.org/wiki/List_of_programmers
  107. param=<li><a href="/wiki/([^"]+)" title=
  108. template=programmers_page
  109.  
  110. [programmers_page]
  111. url=http://en.wikipedia.org/wiki/$1
  112. name=<title>(.+?) - Wikipedia, the free encyclopedia</title>
  113. born=<span class="bday">(.+?)</span>
  114. died=<span class="dday">(.+?)</span>
  115.  
  116.  
  117. """
  118.  
  119. from ConfigParser import ConfigParser
  120. import urllib2
  121. import csv
  122. import sys
  123. import re
  124.  
  125. db = []
  126. db_fields = []
  127. counter = 0
  128. total = 0
  129.  
  130. def db_format(var, s):
  131. #    s = asciify(s)
  132.     return s    
  133.  
  134. def dump_db():
  135.     w = csv.DictWriter(sys.stdout, fieldnames=db_fields)
  136. #    w.writeheader()
  137.     for row in db:
  138.         w.writerow(row)
  139. #        print row
  140.  
  141. def asciify(s):
  142.     s = s.decode("utf-8")
  143.     s = s.replace(u"\u2013", "-")
  144.     s = re.sub("[^\x20-\x7F]"," ", s)
  145.     s = re.sub(" +"," ", s)
  146.     return s.strip()
  147.  
  148. def tagify(s):
  149.     s = asciify(s)
  150.     s = re.sub("[^a-zA-Z0-9-]","_", s)
  151.     return s
  152.  
  153. def get_page(section, url, key):
  154.     global counter, total
  155.     fname = "download/%s/%s.html" % (section, tagify(key))
  156.     try:
  157.         text = file(fname).read()
  158.     except:
  159.         sys.stderr.write("%s\nDownloading page %d of %d...\n" % (url, counter, total))
  160.         text = wget(url, fname)  
  161.     counter += 1
  162.     return text
  163.  
  164. def wget(url, fname):
  165.     import os
  166.     req = urllib2.Request(url, None, {'User-Agent':'megaparser'})
  167.     response = urllib2.urlopen(req)
  168.     text = response.read()
  169.     d = os.path.dirname(fname)
  170.     if not os.path.exists(d):
  171.         os.makedirs(d)
  172.     fp = open(fname, "wb")
  173.     fp.write(text)
  174.     fp.close()
  175.     return text
  176.  
  177. def get_var(conf, sec, var):
  178.     try:
  179.         return conf.get(sec, var)
  180.     except:
  181.         return ""
  182.  
  183. def get_keys(reg, text):
  184.     keys = []
  185.     for m in re.finditer(reg, text, re.MULTILINE | re.DOTALL):
  186.         if len(m.groups()):
  187.             k = m.group(1)
  188.             if k not in keys:
  189.                 keys.append(k)
  190.     return keys
  191.  
  192. def get_values(conf, sec, text, url):
  193.     global db_fields
  194.     res = {}
  195.     for var in db_fields:
  196.         reg = get_var(conf, sec, var)
  197.         vals = []
  198.         if reg:
  199.             for m in re.finditer(reg, text, re.MULTILINE | re.DOTALL):
  200.                 if m and len(m.groups()):
  201.                     #print text[m.start():m.end()], "=>", s #debug
  202.                     val = m.group(1)
  203.                     #print val.decode('utf-8').encode('cp866')
  204.                     val = val.replace('\n','\\n')
  205.                     #sys.stderr.write("%s\n" % val)
  206.                     vals.append(val)
  207.                 res[var] = val
  208.     if res:
  209.         for var in db_fields:
  210.             if var not in res.keys():
  211.                 res[var] = ""
  212.         if "url" in db_fields:
  213.             res["url"] = url
  214.         db.append(res)
  215.  
  216. def get_fields(conf, sec):
  217.     res = []
  218.     for i in conf.items(sec):
  219.         if i[0] not in ["url", "param", "template", "alias"]:
  220.             res.append(i[0])
  221.     return res
  222.  
  223. def process(conf, sec, text, parent, url):
  224.     global db_fields
  225.     global total
  226.     get_values(conf, sec, text, url)
  227.     reg = get_var(conf, sec, "param")
  228.     if reg:
  229.         keys = get_keys(reg, text)
  230.         total += len(keys)
  231.         sec = get_var(conf, sec, "template")
  232.         if sec:
  233.             db_fields = get_fields(conf, sec)
  234.             for key in keys:
  235.                 parse(conf, sec, key, parent)
  236.  
  237. def parse(conf, sec, key="index", parent=""):
  238.     global db_fields
  239.  
  240.     if parent == "":
  241.         parent = sec
  242.  
  243.     url = get_var(conf, sec, "url").replace("$1", key)
  244.  
  245.     text = get_page(parent, url, key)
  246.  
  247.     alias = get_var(conf, sec, "alias")
  248.  
  249.     if alias:
  250.         process(conf, alias, text, parent, url)
  251.         db_fields = get_fields(conf, alias)
  252.         get_values(conf, sec, text, url)
  253.  
  254.     process(conf, sec, text, parent, url)
  255.  
  256. if __name__ == '__main__':
  257.     conf = ConfigParser()
  258.     conf.read("megaparser.ini")
  259.     section = get_var(conf, "settings", "default")
  260.     parse(conf, section)
  261.     dump_db()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement