Advertisement
Guest User

radioscanner.ru grabber

a guest
Jul 12th, 2014
689
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.21 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import urllib
  3. import re
  4. page_body = urllib.urlopen('http://www.radioscanner.ru/base/index.php?action=list&sortBy=0&page=0').read()
  5. pages = max([int(element) for element in re.findall(r'index.php\?action=list&sortBy=0&page=(\d+)', page_body)])
  6. grabbed_data = []
  7.  
  8. regex_id = re.compile('/base/freq(\d+)\.html')
  9. regex_freq = re.compile('(\d+\.\d+)(</span>)?</a>')
  10. regex_views = re.compile('<td class=caption1 style="padding-left:5;color:gray">(\d+)</td>')
  11. regex_acks = re.compile('<td valign="middle" class=caption1><div align="center"><font color="#666666">(\d+)</font></div></td>')
  12. regex_comments = re.compile('<td valign="middle" class=caption1><div align="center">(\d+)</div></td>')
  13.  
  14. logfile = open('radioscanner_base.csv', 'w')
  15.  
  16. logfile.write("'ID','Frequency','Active/Inactive','Place','Date','Modulation','Signal type','Radiocommunication service','Ownership','Callsign','Views','Acks ','Comments','Description'\n")
  17.  
  18. print 'Pages:', pages+1
  19.  
  20. for page_num in range(pages + 1):
  21.     print 'Page #%i' % page_num
  22.     page_body = urllib.urlopen('http://www.radioscanner.ru/base/index.php?action=list&sortBy=0&page=%i' % page_num).read()
  23.     page_body = ''.join(page_body.split('<tr valign="top" class="tbCel1">')[1:])
  24.     page_body = ''.join(page_body.split('<table class=forums><tr><td align="right"><small></small></td></tr></table>')[:-1])
  25.     page_body = page_body.replace('\r', '').replace('\n\n', '\n')
  26.     page_body = page_body.split('</tr>')[:-1]
  27.     for element in page_body:
  28.         id = regex_id.search(element).group(1)
  29.         freq = regex_freq.search(element).group(1)
  30.         active = ['active', 'inactive']["<span style='color:D40202;'>" in element]
  31.         place_and_time = element.split('<td class=caption1 style="padding-left:6;padding-right:5;">')[1].split('</small></td>')[0]
  32.         place = place_and_time.split('<br> <small style="color:#999999;">')[0]
  33.         time = place_and_time.split('<br> <small style="color:#999999;">')[1]
  34.         modulation_and_type = element.split('<td class=caption1 style="padding-left:5"><strong>')[1].split('</td>')[0]
  35.         modulation = modulation_and_type.split('</strong><br>')[0]
  36.         type= modulation_and_type.split('</strong><br>')[1]
  37.         service = element.split('<td class=caption1 style="padding-left:6;padding-right:5;"><small>')[1].split('</small></td>')[0].replace('<br>', '\n').replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', '"')
  38.         own = element.split('<td class=caption1 style="padding-left:5">')[2].split('</td>')[0].replace('<br>', '\n').replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', '"')
  39.         callsign = element.split('td class=caption1 style="padding-left:5"><small>')[1].split('</small></td>')[0].replace('<br>', '\n').replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', '"')
  40.         try: views = regex_views.search(element).group(1)
  41.         except: views = '0'
  42.         try:    acks = regex_acks.search(element).group(1)
  43.         except: acks = '0'
  44.         try: comments = regex_comments.search(element).group(1)
  45.         except: comments = '0'
  46.         if 'javascript:PopUp' in element:
  47.             page_body = urllib.urlopen('http://www.radioscanner.ru/base/index.php?action=freqdescr&freq_id=%s' % id).read()
  48.             description = page_body.split('<td class=caption5 width="30%">')[1].split('</td></tr>')[0].replace('<br>', '\n').replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', '"')
  49.         else:
  50.             description = ''
  51.         #grabbed_data.append({'id':id, 'freq':freq, 'active':active, 'place':place, 'time':time, 'modulation':modulation, 'type':type, 'service':service, 'own':own, 'callsign':callsign})
  52.         logfile.write(id + ',')
  53.         logfile.write(freq + ',')
  54.         logfile.write("'"+active+"'" + ',')
  55.         logfile.write("'"+place+"'" + ',')
  56.         logfile.write("'"+time+"'" + ',')
  57.         logfile.write("'"+modulation+"'" + ',')
  58.         logfile.write("'"+type+"'" + ',')
  59.         logfile.write("'"+service+"'" + ',')
  60.         logfile.write("'"+own+"'" + ',')
  61.         logfile.write("'"+callsign+"'" + ',')
  62.         logfile.write("'"+views+"'" + ',')
  63.         logfile.write("'"+acks+"'" + ',')
  64.         logfile.write("'"+comments+"'" + ',')
  65.         logfile.write("'"+description+"'")
  66.         logfile.write('\n')
  67.     logfile.flush()
  68. logfile.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement