import re
import lxml.html
#from lxml import etree
from lxml.html import builder as E
import lxml.html.clean
from urllib import request
import urllib.parse
from hashlib import sha1
#import time.sleep
from time import sleep
import time
import html.parser

h = html.parser.HTMLParser()

class Scraper(object):

    def __init__(self, url, date="", conn=None, source=None):  #, content):
        self.url = url
        self.conn = conn
        #self.content = content
        self.aid = self.url_to_aid(url)
        self.date = date
        self.source = source

    def get_content(self, encoding='utf-8'):
        if self.conn is not None:
            # only touch the persistent connection when one was provided
            self.reconnect()
            self.conn.request("GET", self.url)
            sleep(0.5)
            res = self.conn.getresponse()
            if res.status != 200:
                #print("\r", self.url, res.status, res.reason)
                #sys.stdout.write("\n\n\r\n", self.url, res.status, res.reason)
                print(res.status, res.reason)
                print(self.url)
                self.reconnect()
                return False

            self.content = res.read().decode(encoding).replace('\r', ' ')  #replace clears residual \r from '\r\n' in html
        else:
            self.content = request.urlopen(self.url).read().decode(encoding).replace('\r', ' ')
        towrite = lxml.html.fromstring(self.content)
        if towrite is not None:
            self.doc = towrite
            return True
        else:
            return False

    def reconnect(self):
        self.conn.close()
        self.conn.connect()

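# Usage sketch (an illustration, not part of the original paste; the URL below
# is made up): the conn argument is assumed to behave like
# http.client.HTTPConnection, which supplies the request()/getresponse()/
# close()/connect() calls used by get_content() and reconnect(). With
# conn=None, get_content() falls back to urllib.request instead.
#
#   import http.client
#   conn = http.client.HTTPConnection("kmb3.kloop.kg")
#   page = ScraperKloop("http://kmb3.kloop.kg/?p=12345", conn=conn)
#   print(page.aid)        # article id parsed from the URL
#   print(page.scraped())  # plain text of the article body
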
class ScraperKloop(Scraper):
    domain = "kmb3.kloop.kg"
    prefix = "kloop"
    rePagenum = re.compile("p(|age)=([0-9]*)")
    badClasses = ['vk-button', 'mailru-button', 'fb-share-button', 'odkl-button', 'twitter-horizontal', 'live-journal', 'google-buzz', 'mrc__share']

    def scraped(self):
        self.get_content()
        #print(self.doc)
        for el in self.doc.find_class('entrytext'):
            pass
        #return lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8'))).text_content()
        cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
        for className in self.badClasses:
            for el in cleaned.find_class(className):
                el.getparent().remove(el)
        # remove all h3 tags
        for badEl in cleaned.findall(".//h3"):
            badEl.getparent().remove(badEl)

        return cleaned.text_content()

    def url_to_aid(self, url):
        return self.rePagenum.search(url).groups()[1]

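# Example of the aid extraction above (an assumption, not from the paste): for
# a URL like "http://kmb3.kloop.kg/?p=45678", rePagenum matches "p=45678" and
# groups()[1] returns "45678", which becomes the article id; a "page=45678"
# query would match the same way.
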
class ScraperAzattyk(Scraper):
    domain = "www.azattyk.org"
    prefix = "rferl"
    rePagecode = re.compile(r"\/([0-9]*)\.html?")

    def scraped(self):
        if self.get_content():
            #print(self.doc)
            el = ""
            for el in self.doc.find_class('zoomMe'):
                pass
            if el == "":
                for ela in self.doc.find_class('boxwidget_part'):
                    if "id" in ela.attrib:
                        if ela.attrib['id'] == "descText":
                            el = ela
            if el != "":
                cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
                # drop embedded media widgets from the article body
                for el in cleaned.find_class("embedded_content_object"):
                    el.getparent().remove(el)
                #for className in self.badClasses:
                #    for el in cleaned.find_class(className):
                #        el.getparent().remove(el)
                #print(cleaned.text_content())
                return cleaned.text_content()
            else:
                return ""

    def url_to_aid(self, url):
        if self.rePagecode.search(url):
            return self.rePagecode.search(url).groups()[0]
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperAzattyq(Scraper):
    domain = "www.azattyq.org"
    prefix = "rferl"
    rePagecode = re.compile(r"\/([0-9]*)\.html?")
    rePagecode2 = re.compile(r"\?id=([0-9]*)")

    def scraped(self):
        if self.get_content():
            #print(self.doc)
            el = ""
            #for el in self.doc.find_class('zoomMe'):
            #print(str(self.doc.find_class('introText')))
            if len(self.doc.find_class('article_txt_intro')) > 0:
                introels = self.doc.find_class('article_txt_intro')
            else:
                introels = None
            if len(self.doc.find_class('articleContent')) > 0:
                for el in self.doc.find_class('articleContent'):
                    pass
                #print(str(el))
                if lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')) != "":
                    #if el. is None:
                    #if self.doc.get_element_by_id('introText'):
                    try:
                        el = self.doc.get_element_by_id('introText')
                    except KeyError:
                        pass
                    #print("INTROTEXT")
            else:
                el = self.doc.get_element_by_id('introText')
                #print("INTROTEXT")
            #elif len(self.doc.find_class('introText')) > 0:
            #    for el in self.doc.find_class('introText'):
            #        pass
            #if el == "":
            #    for ela in self.doc.find_class('boxwidget_part'):
            #        if "id" in ela.attrib:
            #            if ela.attrib['id'] == "descText":
            #                el = ela
            if el != "" and el is not None:
                #to_return = ""
                cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
                for style in cleaned.findall(".//style"):
                    style.drop_tree()
                #for p in el.iter("p"):
                #cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el, pretty_print=True).decode('utf-8')))
                ##print(cleaned.text_content)
                #for br in cleaned.xpath('//br'):
                for br in cleaned.findall(".//br"):
                    if br.tail:
                        br.tail = "\n" + br.tail
                    else:
                        br.tail = "\n"
                    #print(br.tail)
                #for p in cleaned.xpath('//p'):
                for p in cleaned.findall('.//p'):
                    if p.tail:
                        p.tail = "\n" + p.tail
                    else:
                        p.tail = "\n"
                    #print(p)
                ##to_return += cleaned.text_content()+"\n"
                to_return = cleaned.text_content() + "\n"
                toadd = ""
                if introels is not None:
                    for introel in introels:
                        toadd += introel.text_content() + "\n"
                    to_return = toadd + to_return
                #print(to_return)
                ##for className in self.badClasses:
                ##    for el in cleaned.find_class(className):
                ##        el.getparent().remove(el)
                ##print(cleaned.text_content())
                #return cleaned.text_content()
                to_return = re.sub('\n\n\n*', '\n', to_return)
                to_return = to_return.strip('\n')
                return to_return
            else:
                return ""

    def url_to_aid(self, url):
        if self.rePagecode.search(url):
            idsofar = self.rePagecode.search(url).groups()[0]
            if idsofar != "330":
                return idsofar
            else:
                if self.rePagecode2.search(url):
                    idsofar = self.rePagecode2.search(url).groups()[0]
                    return idsofar
                else:
                    return sha1(url.encode('utf-8')).hexdigest()
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperTRT(Scraper):
    domain = "www.trtkyrgyz.com"
    prefix = "trt"
    rePagecode = re.compile(r"haberkodu=([0-9a-f\-]*)(.html)?")

    def scraped(self):
        self.get_content()
        cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(self.content))
        output = ""
        for el in cleaned.findall(".//p"):
            #for subel in el.getiterator():
            #    output += subel.text
            output += el.text_content()
            #print(el.text)
        return output

        #print(self.url)

    def url_to_aid(self, url):
        return self.rePagecode.search(url).groups()[0]

class ScraperBBC(Scraper):
    domain = "www.bbc.co.uk"
    prefix = "bbc"

    def scraped(self):
        self.get_content()
        el = None
        for el in self.doc.find_class('bodytext'):
            pass
        if el is not None:
            cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
            return cleaned.text_content()
        else:
            return False

    def url_to_aid(self, url):
        return sha1(url.encode('utf-8')).hexdigest()

class ScraperAlaman(Scraper):
    domain = "alamankg.org"
    prefix = "alaman"
    reArticleNum = re.compile(r"\/([0-9]*?)\/?$")
    reBadDomain = re.compile(r"^http://alaman\.kg/(.*)")

    def __init__(self, url, date="", conn=None, source=None):
        # Alaman's rss feed points urls at a domain that forwards to the correct
        # domain but without the content; this logic corrects the domain
        if self.reBadDomain.match(url):
            url = self.reBadDomain.sub(r'http://alamankg.org/\g<1>', url)
            print(">> " + url)
        super().__init__(url, date=date, conn=conn, source=source)

    def scraped(self):
        self.get_content()
        el = None
        for el in self.doc.find_class('viewnew'):
            pass
        if el is not None:
            cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
            return cleaned.text_content()
        else:
            return False

    def url_to_aid(self, url):
        return self.reArticleNum.search(url).groups()[0]

class ScraperAzathabar(Scraper):
    domain = "www.azathabar.com"
    prefix = "rferl"
    rePagecode = re.compile(r"\/([0-9]*)\.html?")
    rePagecode2 = re.compile(r"\?id=([0-9]*)")

    def scraped(self):
        #FIXME: check
        if self.get_content():
            #print(self.doc)
            el = ""

            dateEl = self.doc.find_class('article_date')
            if len(dateEl) == 0:
                dateEl = self.doc.find_class('date')
            dateBlah = dateEl[0].text_content().strip('\r\n ')
            if re.match(r"[0-9]{2}\.[0-9]{2}\.[0-9]{4}", dateBlah):
                self.date = time.strftime('%Y-%m-%d', time.strptime(dateBlah, "%d.%m.%Y"))
            else:
                dateBlah = re.sub(r'.*([0-9]{2}\.[0-9]{2}\.[0-9]{4} [0-9]{2}:[0-9]{2}).*', '\\1', dateBlah)
                self.date = time.strftime('%Y-%m-%dT%H:%M', time.strptime(dateBlah, "%d.%m.%Y %H:%M"))

            if len(self.doc.find_class('article_txt_intro')) > 0:
                introels = self.doc.find_class('article_txt_intro')
            else:
                introels = None
            if len(self.doc.find_class('articleContent')) > 0:
                for el in self.doc.find_class('articleContent'):
                    pass
                #print(str(el))
                if len(el.find_class('zoomMe')) > 0:
                    el = el.find_class('zoomMe')[0]
                if lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')) != "":
                    #if el. is None:
                    #if self.doc.get_element_by_id('introText'):
                    try:
                        el = self.doc.get_element_by_id('introText')
                    except KeyError:
                        pass
                    #print("INTROTEXT")
            else:
                el = self.doc.get_element_by_id('introText')

            #for el in self.doc.find_class('zoomMe'):
            #    pass
            #if el == "":
            #    for ela in self.doc.find_class('boxwidget_part'):
            #        if "id" in ela.attrib:
            #            if ela.attrib['id'] == "descText":
            #                el = ela
            if el != "":
                cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
                #for className in self.badClasses:
                #    for el in cleaned.find_class(className):
                #        el.getparent().remove(el)
                #print(cleaned.text_content())
                #return cleaned.text_content()
                for style in cleaned.findall(".//style"):
                    style.drop_tree()
                for br in cleaned.findall(".//br"):
                    if br.tail:
                        br.tail = "\n" + br.tail
                    else:
                        br.tail = "\n"
                for p in cleaned.findall('.//p'):
                    if p.tail:
                        p.tail = "\n" + p.tail
                    else:
                        p.tail = "\n"
                to_return = cleaned.text_content() + "\n"
                toadd = ""
                if introels is not None:
                    for introel in introels:
                        toadd += introel.text_content() + "\n"
                    to_return = toadd + to_return

                to_return = re.sub('\r\n*', '\n', to_return)
                to_return = re.sub('\n\n\n*', '\n', to_return)
                #print(to_return)
                to_return = to_return.strip('\n \t')
                return to_return

        else:
            return ""

    def url_to_aid(self, url):
        if self.rePagecode.search(url):
            idsofar = self.rePagecode.search(url).groups()[0]
            if len(idsofar) > 4:
                return idsofar
            else:
                idsofar = self.rePagecode2.search(url).groups()[0]
                return idsofar
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperOlloo(Scraper):
    domain = "www.olloo.mn"
    prefix = "olloo"

    def scraped(self):
        self.get_content('cp1251')
        cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.find_class('content')[1]).decode('utf-8')))
        cleaned = cleaned.text_content()
        cleaned = h.unescape(cleaned)
        cleaned = h.unescape(cleaned).replace("\r", "")  #remove extra carriage returns
        # restore Mongolian Cyrillic letters that the source encodes with look-alike characters
        cleaned = cleaned.replace('V', 'Ү')
        cleaned = cleaned.replace('v', 'ү')
        cleaned = cleaned.replace('Є', 'Ө')
        cleaned = cleaned.replace('є', 'ө')
        return cleaned

    def url_to_aid(self, url):
        uid = url.split("&sid=")[1]
        if uid is not None:
            return uid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperBolod(Scraper):
    domain = "www.bolod.mn"
    prefix = "bolod"

    def scraped(self):
        self.get_content()
        cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.xpath("//div[@align='justify']")[0]).decode('utf-8')))
        cleaned = cleaned.text_content()
        cleaned = h.unescape(cleaned).replace("\r", "")  #remove extra carriage returns
        return cleaned.strip()

    def url_to_aid(self, url):
        uid = url.split("&nID=")[1]
        if uid is not None:
            return uid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperNewsmn(Scraper):
    domain = "www.news.mn"
    prefix = "newsmn"

    def scraped(self):
        self.get_content()
        content = ""
        if len(self.doc.xpath("//div[@style='text-align: justify;']")) != 0:
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.xpath("//div[@style='text-align: justify;']")[0]).decode('utf-8'))).text_content().strip()
        elif len(self.doc.xpath("//p[@style='text-align: justify;']")) != 0:
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.xpath("//p[@style='text-align: justify;']")[0]).decode('utf-8'))).text_content().strip()
        elif len(self.doc.find_class("text")) != 0:
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.find_class("text")[0]).decode('utf-8'))).text_content().strip()
        elif len(self.doc.find_class("read-bd-body")) != 0:
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.find_class("read-bd-body")[0]).decode('utf-8'))).text_content().strip()
        return content

    def url_to_aid(self, url):
        endUrl = url.split('content/')[1]
        aid = endUrl[:endUrl.find('.shtml')]
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperAzatutyun(Scraper):
    domain = "www.azatutyun.am"
    prefix = "azatutyun"
    badClasses = ['mediaplayer audioplayer', 'cannot-play j_errorContainer', 'downloadlinkstatic', 'playlistlink', 'expandMediaClose']

    def scraped(self):
        self.get_content()
        contentFinal = ""
        # keep the longest of the zoomMe blocks as the article body
        for zoomMeTag in self.doc.find_class("zoomMe"):
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(zoomMeTag).decode('utf-8')))
            if len(content.text_content()) > len(contentFinal.text_content() if contentFinal != "" else contentFinal):
                contentFinal = content
        for className in self.badClasses:
            for el in contentFinal.find_class(className):
                el.getparent().remove(el)
        return contentFinal.text_content().strip()

    def url_to_aid(self, url):
        endUrl = url.split('article/')[1]
        aid = endUrl[:endUrl.find('.html')]
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperChuvash(Scraper):
    domain = "www.chuvash.org"
    prefix = "chuvash"

    def scraped(self):
        self.get_content()
        for el in self.doc.xpath("//span[@style='font-family:Verdana;font-size:10px;']"):
            el.getparent().remove(el)
        content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.find_class("hipar_text")[0]).decode('utf-8'))).text_content().strip()
        if self.source is not None:
            self.source.title = self.doc.xpath("//span[@style='color:#af2900;']")[0].text_content()
        try:
            date = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.find_class("tags")[0]).decode('utf-8'))).text_content().strip()
            date = re.findall(r'[0-9]{2}\.[0-9]{2}\.[0-9]{4}', date)[0]
            self.date = time.strftime('%Y-%m-%d', time.strptime(date, "%d.%m.%Y"))
        except:
            self.date = None
        return content

    def url_to_aid(self, url):
        aid = url.split('news/')[1].replace('.html', '')
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperAzadliq(Scraper):
    domain = "www.azadliq.org"
    prefix = "azadliq"
    badClasses = ['mediaplayer audioplayer', 'cannot-play j_errorContainer', 'downloadlinkstatic', 'playlistlink', 'expandMediaClose']

    def scraped(self):
        self.get_content()
        contentFinal = ""
        for zoomMeTag in self.doc.find_class("zoomMe"):
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(zoomMeTag).decode('utf-8')))
            if len(content.text_content()) > len(contentFinal.text_content() if contentFinal != "" else contentFinal):
                contentFinal = content
        for className in self.badClasses:
            for el in contentFinal.find_class(className):
                el.getparent().remove(el)
        return contentFinal.text_content().strip()

    def url_to_aid(self, url):
        aid = None
        if 'archive/news' in url:
            aid = url.split('?id=')[1]
        else:
            urlSplit = re.findall(r'[0-9]{6,8}\.html$', url)
            if len(urlSplit):
                aid = urlSplit[0].replace('.html', '')
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperHypar(Scraper):
    domain = "www.hypar.ru"
    prefix = "hypar"

    def scraped(self):
        self.get_content(encoding='cp1251')
        content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.find_class("txt_p")[0]).decode('utf-8'))).text_content().strip()
        return content

    def url_to_aid(self, url):
        aid = url.split('32/')[1].replace(r'/index.php', '')
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperOzodlik(Scraper):
    domain = "www.ozodlik.org"
    prefix = "uzb.rferl"
    rePagecode = re.compile(r"\/([0-9]*)\.html?")
    rePagecode2 = re.compile(r"\?id=([0-9]*)")

    def scraped(self):
        #FIXME: check
        if self.get_content():
            #print(self.doc)
            el = ""

            self.get_date()

            if len(self.doc.find_class('article_txt_intro')) > 0:
                introels = self.doc.find_class('article_txt_intro')
            else:
                introels = None
            if len(self.doc.find_class('articleContent')) > 0:
                for el in self.doc.find_class('articleContent'):
                    pass
                #print(str(el))
                if len(el.find_class('zoomMe')) > 0:
                    el = el.find_class('zoomMe')[0]
                if lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')) != "":
                    #if el. is None:
                    #if self.doc.get_element_by_id('introText'):
                    try:
                        el = self.doc.get_element_by_id('introText')
                    except KeyError:
                        pass
                    #print("INTROTEXT")
            else:
                el = self.doc.get_element_by_id('introText')

            #for el in self.doc.find_class('zoomMe'):
            #    pass
            #if el == "":
            #    for ela in self.doc.find_class('boxwidget_part'):
            #        if "id" in ela.attrib:
            #            if ela.attrib['id'] == "descText":
            #                el = ela
            if el != "":
                cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
                #for className in self.badClasses:
                #    for el in cleaned.find_class(className):
                #        el.getparent().remove(el)
                #print(cleaned.text_content())
                #return cleaned.text_content()
                for videodownload in cleaned.find_class("downloadvideoico"):
                    videodownload.drop_tree()
                for style in cleaned.findall(".//style"):
                    style.drop_tree()
                for br in cleaned.findall(".//br"):
                    if br.tail:
                        br.tail = "\n" + br.tail
                    else:
                        br.tail = "\n"
                for p in cleaned.findall('.//p'):
                    if p.tail:
                        p.tail = "\n" + p.tail
                    else:
                        p.tail = "\n"
                to_return = cleaned.text_content() + "\n"
                toadd = ""
                if introels is not None:
                    for introel in introels:
                        toadd += introel.text_content() + "\n"
                    to_return = toadd + to_return

                to_return = re.sub('\r\n*', '\n', to_return)
                to_return = re.sub('\n\n\n*', '\n', to_return)
                #print(to_return)
                to_return = to_return.strip('\n \t')
                return to_return

        else:
            return ""

    def get_date(self):
        dateEl = self.doc.find_class('article_date')
        if len(dateEl) == 0:
            dateEl = self.doc.find_class('date')
        dateBlah = dateEl[0].text_content().strip('\r\n ')
        if re.match(r"^[0-9]{2}\.[0-9]{2}\.[0-9]{4}$", dateBlah):
            self.date = time.strftime('%Y-%m-%d', time.strptime(dateBlah, "%d.%m.%Y"))
        else:
            dateBlah = re.sub(r'.*([0-9]{2}\.[0-9]{2}\.[0-9]{4} [0-9]{2}:[0-9]{2}).*', '\\1', dateBlah)
            self.date = time.strftime('%Y-%m-%dT%H:%M', time.strptime(dateBlah, "%d.%m.%Y %H:%M"))

    def url_to_aid(self, url):
        if self.rePagecode.search(url):
            idsofar = self.rePagecode.search(url).groups()[0]
            if len(idsofar) > 4:
                return idsofar
            else:
                idsofar = self.rePagecode2.search(url).groups()[0]
                return idsofar
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperSvoboda(Scraper):
    domain = "www.radiosvoboda.org"
    prefix = "ukr.rferl"
    rePagecode = re.compile(r"\/([0-9]*)\.html?")
    rePagecode2 = re.compile(r"\?id=([0-9]*)")

    def scraped(self):
        #FIXME: check
        if self.get_content():
            #print(self.doc)
            el = ""

            self.get_date()

            if len(self.doc.find_class('article_txt_intro')) > 0:
                introels = self.doc.find_class('article_txt_intro')
            else:
                introels = None
            if len(self.doc.find_class('articleContent')) > 0:
                for el in self.doc.find_class('articleContent'):
                    pass
                #print(str(el))
                if len(el.find_class('zoomMe')) > 0:
                    el = el.find_class('zoomMe')[0]
                if lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')) != "":
                    #if el. is None:
                    #if self.doc.get_element_by_id('introText'):
                    try:
                        el = self.doc.get_element_by_id('introText')
                    except KeyError:
                        pass
                    #print("INTROTEXT")
            else:
                el = self.doc.get_element_by_id('introText')

            #for el in self.doc.find_class('zoomMe'):
            #    pass
            #if el == "":
            #    for ela in self.doc.find_class('boxwidget_part'):
            #        if "id" in ela.attrib:
            #            if ela.attrib['id'] == "descText":
            #                el = ela
            if el != "":
                cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(el).decode('utf-8')))
                #for className in self.badClasses:
                #    for el in cleaned.find_class(className):
                #        el.getparent().remove(el)
                #print(cleaned.text_content())
                #return cleaned.text_content()
                for videodownload in cleaned.find_class("downloadvideoico"):
                    videodownload.drop_tree()
                for style in cleaned.findall(".//style"):
                    style.drop_tree()
                for br in cleaned.findall(".//br"):
                    if br.tail:
                        br.tail = "\n" + br.tail
                    else:
                        br.tail = "\n"
                for p in cleaned.findall('.//p'):
                    if p.tail:
                        p.tail = "\n" + p.tail
                    else:
                        p.tail = "\n"
                to_return = cleaned.text_content() + "\n"
                toadd = ""
                if introels is not None:
                    for introel in introels:
                        toadd += introel.text_content() + "\n"
                    to_return = toadd + to_return

                to_return = re.sub('\r\n*', '\n', to_return)
                to_return = re.sub('\n\n\n*', '\n', to_return)
                #print(to_return)
                to_return = to_return.strip('\n \t')
                return to_return

        else:
            return ""

    def get_date(self):
        dateEl = self.doc.find_class('article_date')
        if len(dateEl) == 0:
            dateEl = self.doc.find_class('date')
        dateBlah = dateEl[0].text_content().strip('\r\n ')
        if re.match(r"^[0-9]{2}\.[0-9]{2}\.[0-9]{4}$", dateBlah):
            self.date = time.strftime('%Y-%m-%d', time.strptime(dateBlah, "%d.%m.%Y"))
        else:
            dateBlah = re.sub(r'.*([0-9]{2}\.[0-9]{2}\.[0-9]{4} [0-9]{2}:[0-9]{2}).*', '\\1', dateBlah)
            self.date = time.strftime('%Y-%m-%dT%H:%M', time.strptime(dateBlah, "%d.%m.%Y %H:%M"))

    def url_to_aid(self, url):
        if self.rePagecode.search(url):
            idsofar = self.rePagecode.search(url).groups()[0]
            if len(idsofar) > 4:
                return idsofar
            else:
                idsofar = self.rePagecode2.search(url).groups()[0]
                return idsofar
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperAKumukia(Scraper):
    domain = "kumukia.ru"
    prefix = "kumukia"
    dateRe = re.compile(r"[0-9]{2}\.[0-9]{2}\.[0-9]{4}")

    def scraped(self):
        self.get_content(encoding="utf-8")
        #self.doc.find_class("qa-content"):
        for el in self.doc.xpath("//p[@class='href']"):
            el.getparent().remove(el)
        #print(lxml.html.clean.clean_html(self.doc).text_content())
        if len(self.doc.find_class("qa-content")) > 0:
            contentElement = self.doc.find_class("qa-content")[0]
        elif len(self.doc.xpath("//div[@id='qa-content']")) > 0:
            contentElement = self.doc.xpath("//div[@id='qa-content']")[0]
        else:
            contentElement = self.doc.xpath("//div[@id='qa-content']")[0]

        ### get rid of trailing junk ###
        found = False
        for el in contentElement:
            if el.attrib.get("class") == 'qa-info' or found:
                #print(el)
                found = True
                el.drop_tree()
            if el.tag == "style":
                el.drop_tree()
        if self.source is not None:
            self.source.title = self.doc.xpath("//p[@class='title']")[0].text_content()
        try:
            #datetext = self.doc.find_class("qa-info")[0].decode('utf-8')
            datetext = self.doc.xpath("//div[@class='qa-info']")[0].text_content()
            #print("findall", re.findall("[0-9]{2}\.[0-9]{2}\.[0-9]{4}", datetext))
            #print("dateRe", self.dateRe.findall(datetext))
            date = self.dateRe.findall(datetext)[0]
            #print(date)
            self.date = time.strftime('%Y-%m-%d', time.strptime(date, "%d.%m.%Y"))
        except:
            self.date = None

        content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(contentElement).decode('utf-8'))).text_content().strip()
        return content

    def url_to_aid(self, url):
        link = urllib.parse.unquote(url)
        aid = link.split('.ru/')[1].replace(r'.html', '')
        if aid is not None:
            return aid
        else:
            return sha1(link.encode('utf-8')).hexdigest()

class ScraperKumukiaAdab(Scraper):
    domain = "kumukia.ru/adabiat"
    prefix = "kumukia.adabiyat"
    aidRe = re.compile(r"work=([0-9]{1,3})\&page=([0-9]{1,3})")
    notRus = ['46', '45']
    noLang = ['60', '62', '63', '34']
    defRus = ['59', '32']

    def scraped(self):
        self.get_content(encoding="windows-1251")
        #self.doc.find_class("qa-content"):
        (work, page) = self.work_and_page(self.url)

        root = E.HTML(self.doc)

        if work in self.notRus:
            rus = root.xpath("//*[@class='rus']")
            if rus:
                rus[0].attrib['class'] = 'kum'
            qum = root.xpath("//*[@class='qum']")
            if qum:
                qum[0].attrib['class'] = 'kum'
        elif work in self.noLang:
            root.attrib['class'] = 'kum'
        if work in self.defRus:
            for kum in root.xpath("//*[@class='kum']"):
                kum.attrib['class'] = 'rus'

        # get rid of menus
        for el in self.doc.xpath("//div[@class='topmenu']"):
            el.getparent().remove(el)
        # get rid of Russian text
        for el in self.doc.xpath("//*[@class='rus']"):
            el.getparent().remove(el)

        #for el in self.doc.xpath("//a"):
        #    el.getparent().remove(el)
        #print(lxml.html.clean.clean_html(self.doc).text_content())
        #contentElement = self.doc.find_class("qa-content")[0]

        #kumykText = self.doc.xpath("//*[@class='kum']")
        #if kumykText:
        #    contentElement = kumykText[0]
        #else:
        #    # some pages just have class='rus'
        #    return False

        contentElement = E.HTML()
        # the good stuff all seems to be in <p/td class='kum'> nodes.
        # sometimes there's <pre class='kum'> within other class='kum' nodes;
        # so we need to be careful and just choose the good stuff
        for kumykText in self.doc.xpath("//p[@class='kum']|//td[@class='kum']"):
            contentElement.append(kumykText)
        #if contentElement is None:
        #    # some pages just have class='rus'
        #    return False

        #contentElement = self.doc

        ### get rid of trailing junk ###
        found = False

        if self.source is not None:
            h3 = self.doc.xpath("//h3")
            if h3:
                self.source.title = h3[0].text_content()
            moreTitle = self.doc.xpath("//*[@class='title']")
            if moreTitle:
                if self.source.title != "":
                    self.source.title += " - "
                self.source.title += moreTitle[0].text_content()
        self.date = None

        content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(contentElement).decode('utf-8'))).text_content().strip()
        return content

    def work_and_page(self, url):
        matches = self.aidRe.search(url)
        if matches:
            (work, page) = (matches.group(1), matches.group(2))
            return (work, page)
        else:
            return (None, None)

    def url_to_aid(self, url):
        (work, page) = self.work_and_page(url)
        if work and page:
            aid = "w%s+p%s" % (work, page)
        else:
            aid = None

        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperErkinli(Scraper):
    domain = "www.radioerkinli.com"
    prefix = "erkinli"
    badClasses = ['mediaplayer audioplayer', 'cannot-play j_errorContainer', 'downloadlinkstatic', 'playlistlink', 'expandMediaClose']

    def scraped(self):
        self.get_content()
        contentFinal = ""
        for zoomMeTag in self.doc.find_class("zoomMe"):
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(zoomMeTag).decode('utf-8')))
            if len(content.text_content()) > len(contentFinal.text_content() if contentFinal != "" else contentFinal):
                contentFinal = content
        for className in self.badClasses:
            for el in contentFinal.find_class(className):
                el.getparent().remove(el)
        return contentFinal.text_content().strip()

    def url_to_aid(self, url):
        aid = None
        if 'archive/news' in url:
            aid = url.split('?id=')[1]
        else:
            urlSplit = re.findall(r'[0-9]{6,8}\.html$', url)
            if len(urlSplit):
                aid = urlSplit[0].replace('.html', '')
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperKrymr(Scraper):
    domain = "ktat.krymr.com"
    prefix = "krymr"
    badClasses = ['mediaplayer audioplayer', 'cannot-play j_errorContainer', 'downloadlinkstatic', 'playlistlink', 'expandMediaClose']

    def scraped(self):
        self.get_content()
        contentFinal = ""
        for zoomMeTag in self.doc.find_class("zoomMe"):
            content = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(zoomMeTag).decode('utf-8')))
            if len(content.text_content()) > len(contentFinal.text_content() if contentFinal != "" else contentFinal):
                contentFinal = content
        for className in self.badClasses:
            for el in contentFinal.find_class(className):
                el.getparent().remove(el)
        return contentFinal.text_content().strip()

    def url_to_aid(self, url):
        aid = None
        if 'archive/news' in url:
            aid = url.split('?id=')[1]
        else:
            urlSplit = re.findall(r'[0-9]{6,8}\.html$', url)
            if len(urlSplit):
                aid = urlSplit[0].replace('.html', '')
        if aid is not None:
            return aid
        else:
            return sha1(url.encode('utf-8')).hexdigest()

class ScraperAltay(Scraper):
    domain = "altaicholmon.ru"
    prefix = "altaicholmon"

    def scraped(self):
        self.get_content()
        cleaned = lxml.html.document_fromstring(lxml.html.clean.clean_html(lxml.html.tostring(self.doc.xpath('//div[@class="padding_news text-color"]/p')[0]).decode('utf-8')))
        cleaned = cleaned.text_content()
        return cleaned.strip()

    def url_to_aid(self, url):
        return sha1(url.encode('utf-8')).hexdigest()

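# Example driver (a sketch, not part of the original paste; the URL below is
# invented): pick a scraper class by its domain attribute and print the
# article id and text.
if __name__ == "__main__":
    scrapers = {cls.domain: cls for cls in Scraper.__subclasses__()}
    url = "http://www.azattyq.org/content/article/24123456.html"
    page = scrapers[urllib.parse.urlparse(url).netloc](url)
    print(page.aid)
    print(page.scraped())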