Advertisement
Carto_

Untitled

Apr 24th, 2012
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.79 KB | None | 0 0
  1. from twisted.web import client
  2. from twisted.internet import reactor, defer
  3. from bs4 import BeautifulSoup as BeautifulSoup
  4. import time
  5. import xlwt
  6.  
  7. start = time.time()
  8. wb = xlwt.Workbook(encoding='utf-8')
  9. ws = wb.add_sheet("BULATS_IA_PARSED")
  10. global x
  11. x = 0
  12. Countries_List = ['Afghanistan','Armenia','Brazil','Argentina','Armenia','Australia','Austria','Azerbaijan','Bahrain','Bangladesh','Belgium','Belize','Bolivia','Bosnia and Herzegovina','Brazil','Brunei Darussalam','Bulgaria','Cameroon','Canada','Central African Republic','Chile','China','Colombia','Costa Rica','Croatia','Cuba','Cyprus','Czech Republic','Denmark','Dominican Republic','Ecuador','Egypt','Eritrea','Estonia','Ethiopia','Faroe Islands','Fiji','Finland','France','French Polynesia','Georgia','Germany','Gibraltar','Greece','Grenada','Hong Kong','Hungary','Iceland','India','Indonesia','Iran','Iraq','Ireland','Israel','Italy','Jamaica','Japan','Jordan','Kazakhstan','Kenya','Kuwait','Latvia','Lebanon','Libya','Liechtenstein','Lithuania','Luxembourg','Macau','Macedonia','Malaysia','Maldives','Malta','Mexico','Monaco','Montenegro','Morocco','Mozambique','Myanmar (Burma)','Nepal','Netherlands','New Caledonia','New Zealand','Nigeria','Norway','Oman','Pakistan','Palestine','Papua New Guinea','Paraguay','Peru','Philippines','Poland','Portugal','Qatar','Romania','Russia','Saudi Arabia','Serbia','Singapore','Slovakia','Slovenia','South Africa','South Korea','Spain','Sri Lanka','Sweden','Switzerland','Syria','Taiwan','Thailand','Trinadad and Tobago','Tunisia','Turkey','Ukraine','United Arab Emirates','United Kingdom','United States','Uruguay','Uzbekistan','Venezuela','Vietnam']
  13. urls = ["http://www.cambridgeesol.org/institutions/results.php?region=%s&type=&BULATS=on" % Countries for Countries in Countries_List]
  14.  
  15.  
  16. def finish(results):
  17.     global x
  18.     for result in results:
  19.         print 'GOT PAGE', len(result), 'bytes'
  20.         soup = BeautifulSoup(result)
  21.         tableau = soup.findAll('table')
  22.     try:
  23.         rows = tableau[3].findAll('tr')
  24.         print("Fetching")
  25.         for tr in rows:
  26.         cols = tr.findAll('td')
  27.         y = 0
  28.         x = x + 1
  29.         for td in cols:
  30.             texte_bu = td.text
  31.             texte_bu = texte_bu.encode('utf-8')
  32.             #print("Writing...")
  33.                     #print texte_bu
  34.             ws.write(x,y,td.text)
  35.             y = y + 1
  36.     except(IndexError):
  37.         print("No IA for this country")
  38.         pass
  39.        
  40.     reactor.stop()
  41.  
  42. CountryList = ['Test_Value','China ','UK - Scotland','Spain','Poland','Pakistan','Ireland','Hong Kong','Greece','Vietnam','United States of America', 'United Arab Emirates', 'UK - England', 'Turkey', 'Thailand', 'Taiwan', 'Singapore', 'Switzerland', 'Philippines', 'Netherlands', 'New Zealand', 'Malaysia', 'Italy', 'Finland', 'France', 'India', 'Canada', 'Australia', 'Germany', 'Republic of Korea','Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Austria', 'Azerbaijan', 'Bahrain','Bangladesh','Belgium','Belize','Bolivia','Bosnia and Herzegovina','Brazil','Brunei Darussalam','Bulgaria','Burma','Cambodia','Cayman Islands','Chile','Colombia','Cook Islands','Croatia','Cuba','Cyprus','Czech Republic','Denmark','Dominican Republic','East Timor','Ecuador','Egypt','El Salvador','Eritrea','Estonia','Ethiopia','Fiji','Georgia','Ghana','Global','Haiti','Hungary','Iceland','Indonesia','Iran','Iraq','Israel','Israel, the Gaza Strip and the West Bank', 'Istanbul','Jamaica','Japan','Jordan','Kazakhstan','Kenya','Kingdom of Saudi Arabia','Kiribat','Kuwait','Kyrgyz Republic','Laos','Latvia','Lebanon','Libya','Lithuania','Luxembourg','Macedonia','Malta','Mardan','Mauritius','Mexico','Moldova','Monaco','Mongolia','Morocco','Nauru','Nepal','New Caledonia','Nigeria','Niue','Norway','Oman','Palau','Papua New Guinea','Peru','Portugal','Puerto Rico','Qatar','Korea','Romania','Russia','Saarland','Samoa','Saudi Arabia','Serbia','Slovakia','Slovenia','Solomon Islands','South Africa','South Korea','Sri Lanka','St. Kitts','St. Vincent and the Grenadines','Sweden','Syria','Tahiti','Taiwan, Republic of China','Tajikistan','The Netherlands','Tonga','Trinidad and Tobago','Tristan da Cunha','Tunisia','Turkmenistan','Uganda','UK - Northern Ireland','UK - Wales','Ukraine','Uruguay','Uzbekistan','Vanuatu','Various countries','Venezuela','Viet Nam','West Indies','Yemen']
  43.  
  44. #d = getPage(url, method='POST', postdata="hello, world, or whatever.")
  45. waiting = [client.getPage("http://bandscore.ielts.org/search.aspx", method='POST', postdata={
  46.                     '__VIEWSTATE' : '/wEPDwUKMTkwMjE5NDIwNg9kFgICAw9kFgQCAQ8QDxYCHgtfIURhdGFCb3VuZGdkDxaZAQIBAgICAwIEAgUCBgIHAggCCQIKAgsCDAINAg4CDwIQAhECEgITAhQCFQIWAhcCGAIZAhoCGwIcAh0CHgIfAiACIQIiAiMCJAIlAiYCJwIoAikCKgIrAiwCLQIuAi8CMAIxAjICMwI0AjUCNgI3AjgCOQI6AjsCPAI9Aj4CPwJAAkECQgJDAkQCRQJGAkcCSAJJAkoCSwJMAk0CTgJPAlACUQJSAlMCVAJVAlYCVwJYAlkCWgJbAlwCXQJeAl8CYAJhAmICYwJkAmUCZgJnAmgCaQJqAmsCbAJtAm4CbwJwAnECcgJzAnQCdQJ2AncCeAJ5AnoCewJ8An0CfgJ/AoABAoEBAoIBAoMBAoQBAoUBAoYBAocBAogBAokBAooBAosBAowBAo0BAo4BAo8BApABApEBApIBApMBApQBApUBApYBApcBApgBApkBFpkBEAULQWZnaGFuaXN0YW4FC0FmZ2hhbmlzdGFuZxAFB0FsYmFuaWEFB0FsYmFuaWFnEAUHQWxnZXJpYQUHQWxnZXJpYWcQBQlBcmdlbnRpbmEFCUFyZ2VudGluYWcQBQdBcm1lbmlhBQdBcm1lbmlhZxAFCUF1c3RyYWxpYQUJQXVzdHJhbGlhZxAFB0F1c3RyaWEFB0F1c3RyaWFnEAUKQXplcmJhaWphbgUKQXplcmJhaWphbmcQBQdCYWhyYWluBQdCYWhyYWluZxAFCkJhbmdsYWRlc2gFCkJhbmdsYWRlc2hnEAUHQmVsZ2l1bQUHQmVsZ2l1bWcQBQZCZWxpemUFBkJlbGl6ZWcQBQdCb2xpdmlhBQdCb2xpdmlhZxAFFkJvc25pYSBhbmQgSGVyemVnb3ZpbmEFFkJvc25pYSBhbmQgSGVyemVnb3ZpbmFnEAUGQnJhemlsBQZCcmF6aWxnEAURQnJ1bmVpIERhcnVzc2FsYW0FEUJydW5laSBEYXJ1c3NhbGFtZxAFCEJ1bGdhcmlhBQhCdWxnYXJpYWcQBQVCdXJtYQUFQnVybWFnEAUIQ2FtYm9kaWEFCENhbWJvZGlhZxAFBkNhbmFkYQUGQ2FuYWRhZxAFDkNheW1hbiBJc2xhbmRzBQ5DYXltYW4gSXNsYW5kc2cQBQVDaGlsZQUFQ2hpbGVnEAUGQ2hpbmEgBQZDaGluYSBnEAUIQ29sb21iaWEFCENvbG9tYmlhZxAFDENvb2sgSXNsYW5kcwUMQ29vayBJc2xhbmRzZxAFDkPDtHRlIGQnSXZvaXJlBQ5Dw7R0ZSBkJ0l2b2lyZWcQBQdDcm9hdGlhBQdDcm9hdGlhZxAFBEN1YmEFBEN1YmFnEAUGQ3lwcnVzBQZDeXBydXNnEAUOQ3plY2ggUmVwdWJsaWMFDkN6ZWNoIFJlcHVibGljZxAFB0Rlbm1hcmsFB0Rlbm1hcmtnEAUSRG9taW5pY2FuIFJlcHVibGljBRJEb21pbmljYW4gUmVwdWJsaWNnEAUKRWFzdCBUaW1vcgUKRWFzdCBUaW1vcmcQBQdFY3VhZG9yBQdFY3VhZG9yZxAFBUVneXB0BQVFZ3lwdGcQBQtFbCBTYWx2YWRvcgULRWwgU2FsdmFkb3JnEAUHRXJpdHJlYQUHRXJpdHJlYWcQBQdFc3RvbmlhBQdFc3RvbmlhZxAFCEV0aGlvcGlhBQhFdGhpb3BpYWcQBQRGaWppBQRGaWppZxAFB0ZpbmxhbmQFB0ZpbmxhbmRnEAUGRnJhbmNlBQZGcmFuY2VnEAUHR2VvcmdpYQUHR2VvcmdpYWcQBQdHZXJtYW55BQdHZXJtYW55ZxAFBUdoYW5hBQVHaGFuYWcQBQZHbG9iYWwFBkdsb2JhbGcQBQZHcmVlY2UFBkdyZWVjZWcQBQVIYWl0aQUFSGFpdGlnEAUJSG9uZyBLb25nBQlIb25nIEtvbmdnEAUHSHVuZ2FyeQUHSHVuZ2FyeWcQBQdJY2VsYW5kBQdJY2VsYW5kZxAFBUluZGlhBQVJbmRpYWcQBQlJbmRvbmVzaWEFCUluZG9uZXNpYWcQBQRJcmFuBQRJcmFuZxAFBElyYXEFBElyYXFnEAUHSXJlbGFuZAUHSXJlbGFuZGcQBQZJc3JhZWwFBklzcmFlbGcQBSlJc3JhZWwsIHRoZSBHYXphIFN0cmlwIGFuZCB0aGUgV2VzdCBCYW5rIAUpSXNyYWVsLCB0aGUgR2F6YSBTdHJpcCBhbmQgdGhlIFdlc3QgQmFuayBnEAUISXN0YW5idWwFCElzdGFuYnVsZxAFBUl0YWx5BQVJdGFseWcQBQdKYW1haWNhBQdKYW1haWNhZxAFBUphcGFuBQVKYXBhbmcQBQZKb3JkYW4FBkpvcmRhbmcQBQpLYXpha2hzdGFuBQpLYXpha2hzdGFuZxAFBUtlbnlhBQVLZW55YWcQBRdLaW5nZG9tIG9mIFNhdWRpIEFyYWJpYQUXS2luZ2RvbSBvZiBTYXVkaSBBcmFiaWFnEAUIS2lyaWJhdGkFCEtpcmliYXRpZxAFBUtvcmVhBQVLb3JlYWcQBQZLdXdhaXQFBkt1d2FpdGcQBQ9LeXJneXogUmVwdWJsaWMFD0t5cmd5eiBSZXB1YmxpY2cQBQRMYW9zBQRMYW9zZxAFBkxhdHZpYQUGTGF0dmlhZxAFB0xlYmFub24FB0xlYmFub25nEAUFTGlieWEFBUxpYnlhZxAFCUxpdGh1YW5pYQUJTGl0aHVhbmlhZxAFCkx1eGVtYm91cmcFCkx1eGVtYm91cmdnEAUJTWFjZWRvbmlhBQlNYWNlZG9uaWFnEAUITWFsYXlzaWEFCE1hbGF5c2lhZxAFBU1hbHRhBQVNYWx0YWcQBQZNYXJkYW4FBk1hcmRhbmcQBQlNYXVyaXRpdXMFCU1hdXJpdGl1c2cQBQZNZXhpY28FBk1leGljb2cQBQdNb2xkb3ZhBQdNb2xkb3ZhZxAFBk1vbmFjbwUGTW9uYWNvZxAFCE1vbmdvbGlhBQhNb25nb2xpYWcQBQdNb3JvY2NvBQdNb3JvY2NvZxAFBU5hdXJ1BQVOYXVydWcQBQVOZXBhbAUFTmVwYWxnEAULTmV0aGVybGFuZHMFC05ldGhlcmxhbmRzZxAFDU5ldyBDYWxlZG9uaWEFDU5ldyBDYWxlZG9uaWFnEAULTmV3IFplYWxhbmQFC05ldyBaZWFsYW5kZxAFB05pZ2VyaWEFB05pZ2VyaWFnEAUETml1ZQUETml1ZWcQBQZOb3J3YXkFBk5vcndheWcQBQRPbWFuBQRPbWFuZxAFCFBha2lzdGFuBQhQYWtpc3RhbmcQBQVQYWxhdQUFUGFsYXVnEAUQUGFwdWEgTmV3IEd1aW5lYQUQUGFwdWEgTmV3IEd1aW5lYWcQBQRQZXJ1BQRQZXJ1ZxAFC1BoaWxpcHBpbmVzBQtQaGlsaXBwaW5lc2cQBQZQb2xhbmQFBlBvbGFuZGcQBQhQb3J0dWdhbAUIUG9ydHVnYWxnEAULUHVlcnRvIFJpY28FC1B1ZXJ0byBSaWNvZxAFBVFhdGFyBQVRYXRhcmcQBRFSZXB1YmxpYyBvZiBLb3JlYQURUmVwdWJsaWMgb2YgS29yZWFnEAUHUm9tYW5pYQUHUm9tYW5pYWcQBQZSdXNzaWEFBlJ1c3NpYWcQBQhTYWFybGFuZAUIU2FhcmxhbmRnEAUFU2Ftb2EFBVNhbW9hZxAFDFNhdWRpIEFyYWJpYQUMU2F1ZGkgQXJhYmlhZxAFBlNlcmJpYQUGU2VyYmlhZxAFCVNpbmdhcG9yZQUJU2luZ2Fwb3JlZxAFCFNsb3Zha2lhBQhTbG92YWtpYWcQBQhTbG92ZW5pYQUIU2xvdmVuaWFnEAUPU29sb21vbiBJc2xhbmRzBQ9Tb2xvbW9uIElzbGFuZHNnEAUMU291dGggQWZyaWNhBQxTb3V0aCBBZnJpY2FnEAULU291dGggS29yZWEFC1NvdXRoIEtvcmVhZxAFBVNwYWluBQVTcGFpbmcQBQlTcmkgTGFua2EFCVNyaSBMYW5rYWcQBQlTdC4gS2l0dHMFCVN0LiBLaXR0c2cQBR5TdC4gVmluY2VudCBhbmQgdGhlIEdyZW5hZGluZXMFHlN0LiBWaW5jZW50IGFuZCB0aGUgR3JlbmFkaW5lc2cQBQZTd2VkZW4FBlN3ZWRlbmcQBQtTd2l0emVybGFuZAULU3dpdHplcmxhbmRnEAUFU3lyaWEFBVN5cmlhZxAFBlRhaGl0aQUGVGFoaXRpZxAFBlRhaXdhbgUGVGFpd2FuZxAFGVRhaXdhbiwgUmVwdWJsaWMgb2YgQ2hpbmEFGVRhaXdhbiwgUmVwdWJsaWMgb2YgQ2hpbmFnEAUKVGFqaWtpc3RhbgUKVGFqaWtpc3RhbmcQBQhUaGFpbGFuZAUIVGhhaWxhbmRnEAUPVGhlIE5ldGhlcmxhbmRzBQ9UaGUgTmV0aGVybGFuZHNnEAUFVG9uZ2EFBVRvbmdhZxAFE1RyaW5pZGFkIGFuZCBUb2JhZ28FE1RyaW5pZGFkIGFuZCBUb2JhZ29nEAUQVHJpc3RhbiBkYSBDdW5oYQUQVHJpc3RhbiBkYSBDdW5oYWcQBQdUdW5pc2lhBQdUdW5pc2lhZxAFBlR1cmtleQUGVHVya2V5ZxAFDFR1cmttZW5pc3RhbgUMVHVya21lbmlzdGFuZxAFBlVnYW5kYQUGVWdhbmRhZxAFDFVLIC0gRW5nbGFuZAUMVUsgLSBFbmdsYW5kZxAFFVVLIC0gTm9ydGhlcm4gSXJlbGFuZAUVVUsgLSBOb3J0aGVybiBJcmVsYW5kZxAFDVVLIC0gU2NvdGxhbmQFDVVLIC0gU2NvdGxhbmRnEAUKVUsgLSBXYWxlcwUKVUsgLSBXYWxlc2cQBQdVa3JhaW5lBQdVa3JhaW5lZxAFFFVuaXRlZCBBcmFiIEVtaXJhdGVzBRRVbml0ZWQgQXJhYiBFbWlyYXRlc2cQBRhVbml0ZWQgU3RhdGVzIG9mIEFtZXJpY2EFGFVuaXRlZCBTdGF0ZXMgb2YgQW1lcmljYWcQBQdVcnVndWF5BQdVcnVndWF5ZxAFClV6YmVraXN0YW4FClV6YmVraXN0YW5nEAUHVmFudWF0dQUHVmFudWF0dWcQBRFWYXJpb3VzIGNvdW50cmllcwURVmFyaW91cyBjb3VudHJpZXNnEAUJVmVuZXp1ZWxhBQlWZW5lenVlbGFnEAUIVmlldCBOYW0FCFZpZXQgTmFtZxAFB1ZpZXRuYW0FB1ZpZXRuYW1nEAULV2VzdCBJbmRpZXMFC1dlc3QgSW5kaWVzZxAFBVllbWVuBQVZZW1lbmdkZAIPDzwrAA0AZBgCBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAQUKY21kU2VhcmNoeAUQZ2R2U2VhcmNoUmVzdWx0cw9nZLWgL+bo7mQGsIE+VBHwdq0Volr7''',
  47.                     '__PREVIOUSPAGE' : 'EyxfwBf2A7IOt7bJTFbykEKuB-ERzMDNOrfC9rKUImBkq5iE3PhnD2YwJnA7OB5jxPkbo600qGoBYLeqQeK1fbmQkfs1',
  48.                     '__EVENTVALIDATION' : '/wEWpgEC3fjr2Q4CneSP5QoC1Y/P1gICyb/KxQIC3O/xsAICgZrbzgYCqvqP0wkCxoeV5w8C+P231w8C/+u85QECrOjQ+w4Co7iw2A0Ck9rpugMChayScQLQyIOGBgK25qDaCgKtsuGNBQLkiqbGDQKfh4rhCQK+yNiFCwK/t76MDgKW/7aMAgLdt+HwCwKY6b+oCwKc6Yf9CgKCh6KdBALhmpqwBQKvl471CgK07vobAoCjhpAPAp++5MgHAsig5rQPAsHS0JkHAvS84PQHAuSL8PoJAoPF37cCAp+eidUNAuS47eIFAt65jOMHAsPJjOIOAu7HtrkBAt+ZpUECwLCt1gwCr7GNrwsCse3CyAkCzLnGkA0CyLH5pQsCgueEQAKinuLFBwLv66P4DQL4vJvSCQK/nvfWCwLMqLaRAQKp8ZOnAQLV8suWAgKRuryUDwKRurj3DQLMqOLsBgLXy/CXBAK7lbjKBQKLnL2iDwLmvaGeCAKyxLvCCwLduZRYAvbL5LMOAsq2hicCzfiCiwMChsiijAUCpuzZqg8CkcqAmAEChtjjugsCkbOh3gkCzu+X1AsC+uTkmggChY6J4AECxKW7qggCvoGtigsC16er7g4CyZmAhg8C3IHG6AECxa6KuA8C18r8vQ4C1974gwgCo/a+jgQCksvzxQgCjfmGggICgILo9wUCysXYxQ0C+YzTmwUC+76EXALz0v6GDwKw66r5CAK7l5X8BQL97o2+AgLei6qRBAK6ycySDwKbuoiUDwKQ+b6xBALSqY7ZDgLix43HAwKd0NODAQLEjdf7CAKnqfbZDgK3ya/0AQKGvf7eCAKA4Jz3CQL10aCmBAK3xpvACwKb/7bjCQK9qoGQAgKg0cjiCgLRwO3/CwKR1KDuDAKQhYeGDQLDs+yDCQL/s+CDCQKA19NDArzO6bgHAtD4zeUPAr++9fUHAvq/m/8FAu6z14cLAr3A+4EGAoSbjpQPAoGT8vwMAonU2LwOAvXDiaYLAvP0x+sHAoGQ6bgPAsOW4ucIAqKqoYoEAtj8hOkDAuj6wqAOAvG9o8oFAt3mpM4LAq361tIMAqjUlP0JAp3+34oLAve+udULApLz6TIC3Lje8w0Cr/KrzQICgtr4sgsC2uPGqwgCpKPKmgQC/sTelw0C77PHww0ChbGH+QsC5bKuswUCx6jx7AQCsIf1zwkCme3AqQ8Cr8rp8wUCgvTE2QsCkd2whQEC0d7vkwwCruWjpgkClMnJ1wMCirCk7g0CsLWlgg8CveqZxQkC4bX2zAkC76fjuQ8C28aX9QcCytnP7wwCwNm36QM3JE3YXgBSLhN/K/0A9f9zFw4oqw==',
  49.                     'DropDownList1'  : Country,
  50.                     'txtSearchInstitution'  : '',
  51.                     'hdnSearchText'  : '',
  52.                     'rdoFilter': '%25',
  53.                     'cmdSearchx.x'  : '0',
  54.                     'cmdSearchx.y'  : '0',
  55.                     }) for Country in CountryList]
  56.  
  57.  
  58.  
  59. #waiting = [client.getPage(url) for url in urls]
  60. defer.gatherResults(waiting).addCallback(finish)
  61.  
  62. reactor.run()
  63. wb.save("IALOL.xls")
  64. print "Elapsed Time: %s" % (time.time() - start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement