Guest User

radioscanner.ru grabber

a guest
Jul 12th, 2014
492
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #!/usr/bin/env python
"""Scrape the radioscanner.ru frequency base into ``radioscanner_base.csv``.

The script walks every page of the frequency list, extracts one record per
table row and appends it to the CSV file.  Rows that carry a
``javascript:PopUp`` link get their description fetched from a separate page.

Output format: ``ID`` and ``Frequency`` are written bare, every other field
is wrapped in single quotes.  A single quote inside a field is doubled
(``O'Neil`` -> ``'O''Neil'``) so that the field cannot terminate early.

Written for Python 2 (relies on ``urllib.urlopen``).
"""
import urllib
import re

LIST_URL = 'http://www.radioscanner.ru/base/index.php?action=list&sortBy=0&page=%i'
DESCR_URL = 'http://www.radioscanner.ru/base/index.php?action=freqdescr&freq_id=%s'
OUTPUT_FILE = 'radioscanner_base.csv'
CSV_HEADER = "'ID','Frequency','Active/Inactive','Place','Date','Modulation','Signal type','Radiocommunication service','Ownership','Callsign','Views','Acks ','Comments','Description'\n"

# Markers that delimit the data rows inside a list page.
ROW_MARKER = '<tr valign="top" class="tbCel1">'
TABLE_END_MARKER = '<table class=forums><tr><td align="right"><small></small></td></tr></table>'
PLACE_TIME_SEPARATOR = '<br> <small style="color:#999999;">'
MODULATION_TYPE_SEPARATOR = '</strong><br>'

regex_page = re.compile(r'index.php\?action=list&sortBy=0&page=(\d+)')
regex_id = re.compile(r'/base/freq(\d+)\.html')
regex_freq = re.compile(r'(\d+\.\d+)(</span>)?</a>')
regex_views = re.compile(r'<td class=caption1 style="padding-left:5;color:gray">(\d+)</td>')
regex_acks = re.compile(r'<td valign="middle" class=caption1><div align="center"><font color="#666666">(\d+)</font></div></td>')
regex_comments = re.compile(r'<td valign="middle" class=caption1><div align="center">(\d+)</div></td>')

# Applied in this exact order.
# NOTE(review): '&#39;' is the apostrophe entity but is mapped to a double
# quote here; kept as in the original script so existing output is unchanged.
HTML_REPLACEMENTS = (
    ('<br>', '\n'),
    ('&amp;', '&'),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&quot;', '"'),
    ('&#39;', '"'),
)

# Fields written in single quotes, in column order (after ID and Frequency).
QUOTED_FIELDS = (
    'active', 'place', 'time', 'modulation', 'type', 'service', 'own',
    'callsign', 'views', 'acks', 'comments', 'description',
)


def fetch(url):
    """Return the body of *url*, always closing the connection afterwards."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def last_page_index(page_body):
    """Return the highest page index linked from *page_body* (0 if none)."""
    indexes = [int(number) for number in regex_page.findall(page_body)]
    # A listing without pagination links still has its first page.
    return max(indexes) if indexes else 0


def split_rows(page_body):
    """Cut a list page down to its data rows and return them as a list."""
    # Drop everything before the first row; the row markers themselves vanish.
    body = ''.join(page_body.split(ROW_MARKER)[1:])
    # Drop everything from the last trailing-table marker onwards.
    body = ''.join(body.split(TABLE_END_MARKER)[:-1])
    body = body.replace('\r', '').replace('\n\n', '\n')
    # The piece after the final '</tr>' is not a row.
    return body.split('</tr>')[:-1]


def clean_cell(text):
    """Turn ``<br>`` into newlines and decode the handful of known entities."""
    for needle, replacement in HTML_REPLACEMENTS:
        text = text.replace(needle, replacement)
    return text


def first_group(regex, text, default='0'):
    """Return group 1 of the first *regex* match in *text*, else *default*."""
    match = regex.search(text)
    return match.group(1) if match else default


def parse_row(element):
    """Extract one record from the HTML of a single table row.

    Returns a dict with the keys ``id``, ``freq`` and every name listed in
    ``QUOTED_FIELDS``; ``description`` is always ``''`` here because it
    lives on a separate page.

    Raises ``AttributeError`` or ``IndexError`` when a mandatory cell (id,
    frequency, place/date, modulation/type, service, ownership, callsign)
    is missing from *element*.
    """
    place_and_time = element.split('<td class=caption1 style="padding-left:6;padding-right:5;">')[1].split('</small></td>')[0]
    place_time_parts = place_and_time.split(PLACE_TIME_SEPARATOR)
    modulation_and_type = element.split('<td class=caption1 style="padding-left:5"><strong>')[1].split('</td>')[0]
    modulation_type_parts = modulation_and_type.split(MODULATION_TYPE_SEPARATOR)

    service = element.split('<td class=caption1 style="padding-left:6;padding-right:5;"><small>')[1].split('</small></td>')[0]
    # Index 2: the first cell matching this prefix is the modulation cell.
    own = element.split('<td class=caption1 style="padding-left:5">')[2].split('</td>')[0]
    callsign = element.split('td class=caption1 style="padding-left:5"><small>')[1].split('</small></td>')[0]

    if "<span style='color:D40202;'>" in element:
        active = 'inactive'
    else:
        active = 'active'

    return {
        'id': regex_id.search(element).group(1),
        'freq': regex_freq.search(element).group(1),
        'active': active,
        'place': place_time_parts[0],
        'time': place_time_parts[1],
        'modulation': modulation_type_parts[0],
        'type': modulation_type_parts[1],
        'service': clean_cell(service),
        'own': clean_cell(own),
        'callsign': clean_cell(callsign),
        # The counters are optional; a missing cell counts as zero.
        'views': first_group(regex_views, element),
        'acks': first_group(regex_acks, element),
        'comments': first_group(regex_comments, element),
        'description': '',
    }


def parse_description(page_body):
    """Extract the description text from a frequency description page."""
    raw = page_body.split('<td class=caption5 width="30%">')[1].split('</td></tr>')[0]
    return clean_cell(raw)


def quote_field(value):
    """Wrap *value* in single quotes, doubling any quote it contains."""
    return "'" + value.replace("'", "''") + "'"


def format_row(record):
    """Render *record* (as returned by ``parse_row``) as one CSV line."""
    cells = [record['id'], record['freq']]
    cells.extend(quote_field(record[name]) for name in QUOTED_FIELDS)
    return ','.join(cells) + '\n'


def main():
    """Download every list page and write all records to ``OUTPUT_FILE``."""
    pages = last_page_index(fetch(LIST_URL % 0))
    print('Pages: %i' % (pages + 1))

    # 'with' guarantees the file is closed even if a row fails to parse.
    with open(OUTPUT_FILE, 'w') as logfile:
        logfile.write(CSV_HEADER)
        for page_num in range(pages + 1):
            print('Page #%i' % page_num)
            for element in split_rows(fetch(LIST_URL % page_num)):
                record = parse_row(element)
                if 'javascript:PopUp' in element:
                    record['description'] = parse_description(
                        fetch(DESCR_URL % record['id']))
                logfile.write(format_row(record))
            # Keep finished pages on disk in case a later page fails.
            logfile.flush()


if __name__ == '__main__':
    main()
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×