SHARE
TWEET

Untitled

Carto_ Apr 24th, 2012 10 Never
  import time
  import urllib

  import xlwt
  from bs4 import BeautifulSoup as BeautifulSoup
  from twisted.internet import reactor, defer
  from twisted.web import client
  6.  
  # Wall-clock start, used for the elapsed-time report at the end of the run.
  start = time.time()

  # Output workbook; every parsed table cell is written to this one sheet.
  wb = xlwt.Workbook(encoding='utf-8')
  ws = wb.add_sheet("BULATS_IA_PARSED")

  # Row counter shared with finish(), which advances it before writing a row.
  # (A module-level ``global x`` statement has no effect, so it is omitted.)
  x = 0

  # Region names substituted into the BULATS results URL.  Each name appears
  # once so that no page is fetched (and written to the sheet) twice.
  # NOTE(review): 'Trinadad and Tobago' may be a misspelling -- confirm the
  # exact region value the site expects before changing it.
  Countries_List = [
      'Afghanistan', 'Armenia', 'Brazil', 'Argentina', 'Australia', 'Austria',
      'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belgium', 'Belize', 'Bolivia',
      'Bosnia and Herzegovina', 'Brunei Darussalam', 'Bulgaria', 'Cameroon',
      'Canada', 'Central African Republic', 'Chile', 'China', 'Colombia',
      'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark',
      'Dominican Republic', 'Ecuador', 'Egypt', 'Eritrea', 'Estonia',
      'Ethiopia', 'Faroe Islands', 'Fiji', 'Finland', 'France',
      'French Polynesia', 'Georgia', 'Germany', 'Gibraltar', 'Greece',
      'Grenada', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia',
      'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan',
      'Jordan', 'Kazakhstan', 'Kenya', 'Kuwait', 'Latvia', 'Lebanon', 'Libya',
      'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macau', 'Macedonia',
      'Malaysia', 'Maldives', 'Malta', 'Mexico', 'Monaco', 'Montenegro',
      'Morocco', 'Mozambique', 'Myanmar (Burma)', 'Nepal', 'Netherlands',
      'New Caledonia', 'New Zealand', 'Nigeria', 'Norway', 'Oman', 'Pakistan',
      'Palestine', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines',
      'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Saudi Arabia',
      'Serbia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa',
      'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Syria',
      'Taiwan', 'Thailand', 'Trinadad and Tobago', 'Tunisia', 'Turkey',
      'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States',
      'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam',
  ]

  # One results-page URL per region.  The name is escaped because several
  # contain spaces or parentheses, which are not valid inside a query string.
  urls = [
      "http://www.cambridgeesol.org/institutions/results.php?region=%s&type=&BULATS=on"
      % urllib.quote_plus(region)
      for region in Countries_List
  ]
  14.  
  15.  
  def finish(results):
      """Copy the result table of every fetched page into the worksheet.

      Called once with the list of page bodies after all requests complete.
      For each page the fourth ``<table>`` is located and each of its rows is
      written to the module-level sheet ``ws``, one cell per ``<td>``.  Pages
      with fewer than four tables are reported and skipped.  The module-level
      row counter ``x`` is advanced before a row is written, including for
      rows that contain no ``<td>`` cells.  The reactor is stopped once every
      page has been processed.

      :param results: iterable of HTML page bodies.
      """
      global x
      for result in results:
          print('GOT PAGE %d bytes' % len(result))
          tables = BeautifulSoup(result).findAll('table')
          # Only the table lookup can legitimately raise IndexError, so the
          # handler is scoped to it rather than to the whole write loop.
          try:
              rows = tables[3].findAll('tr')
          except IndexError:
              print("No IA for this country")
              continue
          print("Fetching")
          for tr in rows:
              x += 1
              for y, td in enumerate(tr.findAll('td')):
                  ws.write(x, y, td.text)

      reactor.stop()
  41.  
  # Values posted as the search form's country drop-down, one request each.
  # NOTE(review): 'Test_Value' looks like a leftover placeholder and 'Kiribat'
  # like a typo (the drop-down data embedded in __VIEWSTATE below appears to
  # spell it 'Kiribati') -- confirm before relying on the rows for those entries.
  CountryList = [
      'Test_Value', 'China ', 'UK - Scotland', 'Spain', 'Poland', 'Pakistan',
      'Ireland', 'Hong Kong', 'Greece', 'Vietnam', 'United States of America',
      'United Arab Emirates', 'UK - England', 'Turkey', 'Thailand', 'Taiwan',
      'Singapore', 'Switzerland', 'Philippines', 'Netherlands', 'New Zealand',
      'Malaysia', 'Italy', 'Finland', 'France', 'India', 'Canada', 'Australia',
      'Germany', 'Republic of Korea', 'Afghanistan', 'Albania', 'Algeria',
      'Argentina', 'Armenia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh',
      'Belgium', 'Belize', 'Bolivia', 'Bosnia and Herzegovina', 'Brazil',
      'Brunei Darussalam', 'Bulgaria', 'Burma', 'Cambodia', 'Cayman Islands',
      'Chile', 'Colombia', 'Cook Islands', 'Croatia', 'Cuba', 'Cyprus',
      'Czech Republic', 'Denmark', 'Dominican Republic', 'East Timor',
      'Ecuador', 'Egypt', 'El Salvador', 'Eritrea', 'Estonia', 'Ethiopia',
      'Fiji', 'Georgia', 'Ghana', 'Global', 'Haiti', 'Hungary', 'Iceland',
      'Indonesia', 'Iran', 'Iraq', 'Israel',
      'Israel, the Gaza Strip and the West Bank', 'Istanbul', 'Jamaica',
      'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kingdom of Saudi Arabia',
      'Kiribat', 'Kuwait', 'Kyrgyz Republic', 'Laos', 'Latvia', 'Lebanon',
      'Libya', 'Lithuania', 'Luxembourg', 'Macedonia', 'Malta', 'Mardan',
      'Mauritius', 'Mexico', 'Moldova', 'Monaco', 'Mongolia', 'Morocco',
      'Nauru', 'Nepal', 'New Caledonia', 'Nigeria', 'Niue', 'Norway', 'Oman',
      'Palau', 'Papua New Guinea', 'Peru', 'Portugal', 'Puerto Rico', 'Qatar',
      'Korea', 'Romania', 'Russia', 'Saarland', 'Samoa', 'Saudi Arabia',
      'Serbia', 'Slovakia', 'Slovenia', 'Solomon Islands', 'South Africa',
      'South Korea', 'Sri Lanka', 'St. Kitts',
      'St. Vincent and the Grenadines', 'Sweden', 'Syria', 'Tahiti',
      'Taiwan, Republic of China', 'Tajikistan', 'The Netherlands', 'Tonga',
      'Trinidad and Tobago', 'Tristan da Cunha', 'Tunisia', 'Turkmenistan',
      'Uganda', 'UK - Northern Ireland', 'UK - Wales', 'Ukraine', 'Uruguay',
      'Uzbekistan', 'Vanuatu', 'Various countries', 'Venezuela', 'Viet Nam',
      'West Indies', 'Yemen',
  ]

  # Hidden form-state fields posted back with every search.  They are fixed
  # snapshots and identical for every request, so they are defined once here.
  # NOTE(review): presumably captured from a live page load; they may need to
  # be refreshed if the server starts rejecting the posts.
  _VIEWSTATE = (
      '/wEPDwUKMTkwMjE5NDIwNg9kFgICAw9kFgQCAQ8QDxYCHgtfIURhdGFCb3VuZGdkDxaZAQIBAgICAwIEAgUCBgIHAggCCQIKAgsCDAINAg4CDwIQAhECEgITAhQCFQIWAhcCGAIZAhoCGwIcAh0CHgIfAiACIQIiAiMCJAIlAiYCJwIoAikCKgIrAiwCLQIuAi8CMAIxAjICMwI0AjUCNgI3AjgCOQI6AjsCPAI9Aj4CPwJAAkECQgJDAkQCRQJGAkcCSAJJAkoCSwJMAk0CTgJPAlACUQJSAlMCVAJVAlYCVwJYAlkCWgJbAlwCXQJeAl8CYAJhAmICYwJkAmUCZgJnAmgCaQJqAmsCbAJtAm4CbwJwAnECcgJzAnQCdQJ2AncCeAJ5AnoCewJ8An0CfgJ/AoABAoEBAoIBAoMBAoQBAoUBAoYBAocBAogBAokBAooBAosBAowBAo0BAo4BAo8BApABApEBApIBApMBApQBApUBApYBApcBApgBApkBFpkBEAULQWZnaGFuaXN0YW4FC0FmZ2hhbmlzdGFuZxAFB0FsYmFuaWEFB0FsYmFuaWFnEAUHQWxnZXJpYQUHQWxnZXJpYWcQBQlBcmdlbnRpbmEFCUFyZ2VudGluYWcQBQdBcm1lbmlhBQdBcm1lbmlhZxAFCUF1c3RyYWxpYQUJQXVzdHJhbGlhZxAFB0F1c3RyaWEFB0F1c3RyaWFnEAUKQXplcmJhaWphbgUKQXplcmJhaWphbmcQBQdCYWhyYWluBQdCYWhyYWluZxAFCkJhbmdsYWRlc2gFCkJhbmdsYWRlc2hnEAUHQmVsZ2l1bQUHQmVsZ2l1bWcQBQZCZWxpemUFBkJlbGl6ZWcQBQdCb2xpdmlhBQdCb2xpdmlhZxAFFkJvc25pYSBhbmQgSGVyemVnb3ZpbmEFFkJvc25pYSBhbmQgSGVyemVnb3ZpbmFnEAUGQnJhemlsBQZCcmF6aWxnEAURQnJ1bmVpIERhcnVzc2FsYW0FEUJydW5laSBEYXJ1c3NhbGFtZxAFCEJ1bGdhcmlhBQhCdWxnYXJpYWcQBQVCdXJtYQUFQnVybWFnEAUIQ2FtYm9kaWEFCENhbWJvZGlhZxAFBkNhbmFkYQUGQ2FuYWRhZxAFDkNheW1hbiBJc2xhbmRzBQ5DYXltYW4gSXNsYW5kc2cQBQVDaGlsZQUFQ2hpbGVnEAUGQ2hpbmEgBQZDaGluYSBnEAUIQ29sb21iaWEFCENvbG9tYmlhZxAFDENvb2sgSXNsYW5kcwUMQ29vayBJc2xhbmRzZxAFDkPDtHRlIGQnSXZvaXJlBQ5Dw7R0ZSBkJ0l2b2lyZWcQBQdDcm9hdGlhBQdDcm9hdGlhZxAFBEN1YmEFBEN1YmFnEAUGQ3lwcnVzBQZDeXBydXNnEAUOQ3plY2ggUmVwdWJsaWMFDkN6ZWNoIFJlcHVibGljZxAFB0Rlbm1hcmsFB0Rlbm1hcmtnEAUSRG9taW5pY2FuIFJlcHVibGljBRJEb21pbmljYW4gUmVwdWJsaWNnEAUKRWFzdCBUaW1vcgUKRWFzdCBUaW1vcmcQBQdFY3VhZG9yBQdFY3VhZG9yZxAFBUVneXB0BQVFZ3lwdGcQBQtFbCBTYWx2YWRvcgULRWwgU2FsdmFkb3JnEAUHRXJpdHJlYQUHRXJpdHJlYWcQBQdFc3RvbmlhBQdFc3RvbmlhZxAFCEV0aGlvcGlhBQhFdGhpb3BpYWcQBQRGaWppBQRGaWppZxAFB0ZpbmxhbmQFB0ZpbmxhbmRnEAUGRnJhbmNlBQZGcmFuY2VnEAUHR2VvcmdpYQUHR2VvcmdpYWcQBQdHZXJtYW55BQdHZXJtYW55ZxAFBUdoYW5hBQVHaGFuYWcQBQZHbG9iYWwFBkdsb2JhbGcQBQZHcmVlY2UFBkdyZWVjZWcQBQVIYWl0aQUFS'
      'GFpdGlnEAUJSG9uZyBLb25nBQlIb25nIEtvbmdnEAUHSHVuZ2FyeQUHSHVuZ2FyeWcQBQdJY2VsYW5kBQdJY2VsYW5kZxAFBUluZGlhBQVJbmRpYWcQBQlJbmRvbmVzaWEFCUluZG9uZXNpYWcQBQRJcmFuBQRJcmFuZxAFBElyYXEFBElyYXFnEAUHSXJlbGFuZAUHSXJlbGFuZGcQBQZJc3JhZWwFBklzcmFlbGcQBSlJc3JhZWwsIHRoZSBHYXphIFN0cmlwIGFuZCB0aGUgV2VzdCBCYW5rIAUpSXNyYWVsLCB0aGUgR2F6YSBTdHJpcCBhbmQgdGhlIFdlc3QgQmFuayBnEAUISXN0YW5idWwFCElzdGFuYnVsZxAFBUl0YWx5BQVJdGFseWcQBQdKYW1haWNhBQdKYW1haWNhZxAFBUphcGFuBQVKYXBhbmcQBQZKb3JkYW4FBkpvcmRhbmcQBQpLYXpha2hzdGFuBQpLYXpha2hzdGFuZxAFBUtlbnlhBQVLZW55YWcQBRdLaW5nZG9tIG9mIFNhdWRpIEFyYWJpYQUXS2luZ2RvbSBvZiBTYXVkaSBBcmFiaWFnEAUIS2lyaWJhdGkFCEtpcmliYXRpZxAFBUtvcmVhBQVLb3JlYWcQBQZLdXdhaXQFBkt1d2FpdGcQBQ9LeXJneXogUmVwdWJsaWMFD0t5cmd5eiBSZXB1YmxpY2cQBQRMYW9zBQRMYW9zZxAFBkxhdHZpYQUGTGF0dmlhZxAFB0xlYmFub24FB0xlYmFub25nEAUFTGlieWEFBUxpYnlhZxAFCUxpdGh1YW5pYQUJTGl0aHVhbmlhZxAFCkx1eGVtYm91cmcFCkx1eGVtYm91cmdnEAUJTWFjZWRvbmlhBQlNYWNlZG9uaWFnEAUITWFsYXlzaWEFCE1hbGF5c2lhZxAFBU1hbHRhBQVNYWx0YWcQBQZNYXJkYW4FBk1hcmRhbmcQBQlNYXVyaXRpdXMFCU1hdXJpdGl1c2cQBQZNZXhpY28FBk1leGljb2cQBQdNb2xkb3ZhBQdNb2xkb3ZhZxAFBk1vbmFjbwUGTW9uYWNvZxAFCE1vbmdvbGlhBQhNb25nb2xpYWcQBQdNb3JvY2NvBQdNb3JvY2NvZxAFBU5hdXJ1BQVOYXVydWcQBQVOZXBhbAUFTmVwYWxnEAULTmV0aGVybGFuZHMFC05ldGhlcmxhbmRzZxAFDU5ldyBDYWxlZG9uaWEFDU5ldyBDYWxlZG9uaWFnEAULTmV3IFplYWxhbmQFC05ldyBaZWFsYW5kZxAFB05pZ2VyaWEFB05pZ2VyaWFnEAUETml1ZQUETml1ZWcQBQZOb3J3YXkFBk5vcndheWcQBQRPbWFuBQRPbWFuZxAFCFBha2lzdGFuBQhQYWtpc3RhbmcQBQVQYWxhdQUFUGFsYXVnEAUQUGFwdWEgTmV3IEd1aW5lYQUQUGFwdWEgTmV3IEd1aW5lYWcQBQRQZXJ1BQRQZXJ1ZxAFC1BoaWxpcHBpbmVzBQtQaGlsaXBwaW5lc2cQBQZQb2xhbmQFBlBvbGFuZGcQBQhQb3J0dWdhbAUIUG9ydHVnYWxnEAULUHVlcnRvIFJpY28FC1B1ZXJ0byBSaWNvZxAFBVFhdGFyBQVRYXRhcmcQBRFSZXB1YmxpYyBvZiBLb3JlYQURUmVwdWJsaWMgb2YgS29yZWFnEAUHUm9tYW5pYQUHUm9tYW5pYWcQBQZSdXNzaWEFBlJ1c3NpYWcQBQhTYWFybGFuZAUIU2FhcmxhbmRnEAUFU2Ftb2EFBVNhbW9hZxAFDFNhdWRpIEFyYWJpYQUMU2F1ZGkgQXJhYmlhZxAFBlNlcmJpYQUGU2VyYmlhZxAFCVNpbmdhcG9yZQUJU2luZ2Fwb3JlZxAFCFNsb3Zha2lhBQhTbG92YWtpYWcQBQhTbG92ZW5pYQUIU2xvdmVuaWFnEAUPU29sb21vbiBJc2xhbmRzB'
      'Q9Tb2xvbW9uIElzbGFuZHNnEAUMU291dGggQWZyaWNhBQxTb3V0aCBBZnJpY2FnEAULU291dGggS29yZWEFC1NvdXRoIEtvcmVhZxAFBVNwYWluBQVTcGFpbmcQBQlTcmkgTGFua2EFCVNyaSBMYW5rYWcQBQlTdC4gS2l0dHMFCVN0LiBLaXR0c2cQBR5TdC4gVmluY2VudCBhbmQgdGhlIEdyZW5hZGluZXMFHlN0LiBWaW5jZW50IGFuZCB0aGUgR3JlbmFkaW5lc2cQBQZTd2VkZW4FBlN3ZWRlbmcQBQtTd2l0emVybGFuZAULU3dpdHplcmxhbmRnEAUFU3lyaWEFBVN5cmlhZxAFBlRhaGl0aQUGVGFoaXRpZxAFBlRhaXdhbgUGVGFpd2FuZxAFGVRhaXdhbiwgUmVwdWJsaWMgb2YgQ2hpbmEFGVRhaXdhbiwgUmVwdWJsaWMgb2YgQ2hpbmFnEAUKVGFqaWtpc3RhbgUKVGFqaWtpc3RhbmcQBQhUaGFpbGFuZAUIVGhhaWxhbmRnEAUPVGhlIE5ldGhlcmxhbmRzBQ9UaGUgTmV0aGVybGFuZHNnEAUFVG9uZ2EFBVRvbmdhZxAFE1RyaW5pZGFkIGFuZCBUb2JhZ28FE1RyaW5pZGFkIGFuZCBUb2JhZ29nEAUQVHJpc3RhbiBkYSBDdW5oYQUQVHJpc3RhbiBkYSBDdW5oYWcQBQdUdW5pc2lhBQdUdW5pc2lhZxAFBlR1cmtleQUGVHVya2V5ZxAFDFR1cmttZW5pc3RhbgUMVHVya21lbmlzdGFuZxAFBlVnYW5kYQUGVWdhbmRhZxAFDFVLIC0gRW5nbGFuZAUMVUsgLSBFbmdsYW5kZxAFFVVLIC0gTm9ydGhlcm4gSXJlbGFuZAUVVUsgLSBOb3J0aGVybiBJcmVsYW5kZxAFDVVLIC0gU2NvdGxhbmQFDVVLIC0gU2NvdGxhbmRnEAUKVUsgLSBXYWxlcwUKVUsgLSBXYWxlc2cQBQdVa3JhaW5lBQdVa3JhaW5lZxAFFFVuaXRlZCBBcmFiIEVtaXJhdGVzBRRVbml0ZWQgQXJhYiBFbWlyYXRlc2cQBRhVbml0ZWQgU3RhdGVzIG9mIEFtZXJpY2EFGFVuaXRlZCBTdGF0ZXMgb2YgQW1lcmljYWcQBQdVcnVndWF5BQdVcnVndWF5ZxAFClV6YmVraXN0YW4FClV6YmVraXN0YW5nEAUHVmFudWF0dQUHVmFudWF0dWcQBRFWYXJpb3VzIGNvdW50cmllcwURVmFyaW91cyBjb3VudHJpZXNnEAUJVmVuZXp1ZWxhBQlWZW5lenVlbGFnEAUIVmlldCBOYW0FCFZpZXQgTmFtZxAFB1ZpZXRuYW0FB1ZpZXRuYW1nEAULV2VzdCBJbmRpZXMFC1dlc3QgSW5kaWVzZxAFBVllbWVuBQVZZW1lbmdkZAIPDzwrAA0AZBgCBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAQUKY21kU2VhcmNoeAUQZ2R2U2VhcmNoUmVzdWx0cw9nZLWgL+bo7mQGsIE+VBHwdq0Volr7'
  )
  _PREVIOUSPAGE = 'EyxfwBf2A7IOt7bJTFbykEKuB-ERzMDNOrfC9rKUImBkq5iE3PhnD2YwJnA7OB5jxPkbo600qGoBYLeqQeK1fbmQkfs1'
  _EVENTVALIDATION = '/wEWpgEC3fjr2Q4CneSP5QoC1Y/P1gICyb/KxQIC3O/xsAICgZrbzgYCqvqP0wkCxoeV5w8C+P231w8C/+u85QECrOjQ+w4Co7iw2A0Ck9rpugMChayScQLQyIOGBgK25qDaCgKtsuGNBQLkiqbGDQKfh4rhCQK+yNiFCwK/t76MDgKW/7aMAgLdt+HwCwKY6b+oCwKc6Yf9CgKCh6KdBALhmpqwBQKvl471CgK07vobAoCjhpAPAp++5MgHAsig5rQPAsHS0JkHAvS84PQHAuSL8PoJAoPF37cCAp+eidUNAuS47eIFAt65jOMHAsPJjOIOAu7HtrkBAt+ZpUECwLCt1gwCr7GNrwsCse3CyAkCzLnGkA0CyLH5pQsCgueEQAKinuLFBwLv66P4DQL4vJvSCQK/nvfWCwLMqLaRAQKp8ZOnAQLV8suWAgKRuryUDwKRurj3DQLMqOLsBgLXy/CXBAK7lbjKBQKLnL2iDwLmvaGeCAKyxLvCCwLduZRYAvbL5LMOAsq2hicCzfiCiwMChsiijAUCpuzZqg8CkcqAmAEChtjjugsCkbOh3gkCzu+X1AsC+uTkmggChY6J4AECxKW7qggCvoGtigsC16er7g4CyZmAhg8C3IHG6AECxa6KuA8C18r8vQ4C1974gwgCo/a+jgQCksvzxQgCjfmGggICgILo9wUCysXYxQ0C+YzTmwUC+76EXALz0v6GDwKw66r5CAK7l5X8BQL97o2+AgLei6qRBAK6ycySDwKbuoiUDwKQ+b6xBALSqY7ZDgLix43HAwKd0NODAQLEjdf7CAKnqfbZDgK3ya/0AQKGvf7eCAKA4Jz3CQL10aCmBAK3xpvACwKb/7bjCQK9qoGQAgKg0cjiCgLRwO3/CwKR1KDuDAKQhYeGDQLDs+yDCQL/s+CDCQKA19NDArzO6bgHAtD4zeUPAr++9fUHAvq/m/8FAu6z14cLAr3A+4EGAoSbjpQPAoGT8vwMAonU2LwOAvXDiaYLAvP0x+sHAoGQ6bgPAsOW4ucIAqKqoYoEAtj8hOkDAuj6wqAOAvG9o8oFAt3mpM4LAq361tIMAqjUlP0JAp3+34oLAve+udULApLz6TIC3Lje8w0Cr/KrzQICgtr4sgsC2uPGqwgCpKPKmgQC/sTelw0C77PHww0ChbGH+QsC5bKuswUCx6jx7AQCsIf1zwkCme3AqQ8Cr8rp8wUCgvTE2QsCkd2whQEC0d7vkwwCruWjpgkClMnJ1wMCirCk7g0CsLWlgg8CveqZxQkC4bX2zAkC76fjuQ8C28aX9QcCytnP7wwCwNm36QM3JE3YXgBSLhN/K/0A9f9zFw4oqw=='


  def _search_postdata(country):
      """Return the urlencoded search-form body that selects *country*.

      The request body has to be a string: getPage() sends ``postdata`` as
      the raw body, so the form fields are urlencoded here instead of being
      handed over as a dict.
      """
      # NOTE(review): 'rdoFilter' is kept as the literal value '%25'; if that
      # was copied from an already-encoded request it should be '%' -- confirm.
      return urllib.urlencode([
          ('__VIEWSTATE', _VIEWSTATE),
          ('__PREVIOUSPAGE', _PREVIOUSPAGE),
          ('__EVENTVALIDATION', _EVENTVALIDATION),
          ('DropDownList1', country),
          ('txtSearchInstitution', ''),
          ('hdnSearchText', ''),
          ('rdoFilter', '%25'),
          ('cmdSearchx.x', '0'),
          ('cmdSearchx.y', '0'),
      ])


  # One pending POST (a Deferred) per country; all are started immediately.
  waiting = [
      client.getPage(
          "http://bandscore.ielts.org/search.aspx",
          method='POST',
          postdata=_search_postdata(country),
          # Without this header the server has no reason to parse the body
          # as form fields.
          headers={'Content-Type': 'application/x-www-form-urlencoded'},
      )
      for country in CountryList
  ]
  56.  
  57.  
  58.  
  def _abort(failure):
      """Report a failed run and stop the reactor so the script can exit.

      Attached after finish(), so it fires both when a request fails and when
      finish() itself raises before it has stopped the reactor.
      """
      print("Fetch failed: %s" % failure.getErrorMessage())
      reactor.stop()


  # Alternative data source: plain GET requests for the BULATS result pages.
  #waiting = [client.getPage(url) for url in urls]
  pending = defer.gatherResults(waiting, consumeErrors=True)
  pending.addCallback(finish)
  # With only a callback attached, a single failed request would leave the
  # reactor running forever and the workbook would never be saved.
  pending.addErrback(_abort)

  reactor.run()
  wb.save("IALOL.xls")
  print("Elapsed Time: %s" % (time.time() - start))
RAW Paste Data
Top