Guest User

Untitled

a guest
Mar 18th, 2018
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.88 KB | None | 0 0
  1. import urllib.request
  2. from bs4 import BeautifulSoup
  3. from string import Template
  4. import time
  5. import random
  6. import json
  7.  
  # DDL executed before any work is done: creates the ``body`` table (one
  # row per news item: its id plus the extracted article text, stored as
  # utf8mb4) unless it already exists.
  schema = (
      "CREATE TABLE IF NOT EXISTS body ( "
      "news_id TEXT, "
      "body TEXT CHARACTER SET utf8mb4 )"
  )

  # INSERT statement template. ``$news_id`` and ``$body`` are substituted
  # textually inside single quotes, so whoever fills the template in is
  # responsible for escaping the values first.
  insertQuery = Template(
      "INSERT INTO body (news_id, body) "
      "VALUES ('$news_id', '$body')"
  )
  10.  
  def remove_tags(text):
      """Return the plain text of *text* with its HTML markup stripped.

      The markup is parsed with BeautifulSoup's ``html.parser`` backend and
      the result of ``get_text()`` on the parsed document is returned.
      """
      return BeautifulSoup(text, 'html.parser').get_text()
  14.  
  15.  
  # TODO: "dump" is too generic a name for the source table.
  tableName = "dump"


  def _extract_body_text(json_string):
      """Join the stripped string ``content`` values found in a JSON array.

      *json_string* is decoded with ``json.loads`` and iterated. Entries
      without a ``content`` key, or whose ``content`` is not a string, are
      skipped.
      """
      parts = []
      for entry in json.loads(json_string):
          # NOTE: sometimes content is a dict for interactive stories
          text = entry.get("content") if isinstance(entry, dict) else None
          if isinstance(text, str):
              parts.append(text.strip())
      return "".join(parts)


  def fix(conn):
      """Download and store the article text of every news row lacking one.

      Ensures the ``body`` table exists, then selects the ``id`` and
      ``src_link`` of each row in ``tableName`` with source_name 'CGTN' and
      actual_source_name 'APP' that has no matching ``body`` row. Each link
      is fetched, the ``data-json`` attribute of its ``div.content`` element
      is decoded, the string fragments are joined and stripped of HTML, and
      the result is inserted into ``body``. Pages without that element or
      attribute get an empty body so they are not selected again. Progress
      is printed, each row is committed on its own, and the loop pauses
      0.5 s between requests.

      Args:
          conn: an open DB-API connection providing ``cursor()`` and
              ``commit()``.
      """
      # Values are bound by the driver instead of being pasted into the SQL
      # text, so quotes and backslashes in scraped pages cannot break (or
      # inject into) the statement and no manual escaping is needed.
      # NOTE(review): the %s placeholder assumes a MySQL driver, which is
      # what the CHARACTER SET clause in the schema suggests - confirm.
      insert_sql = "INSERT INTO body (news_id, body) VALUES (%s, %s)"
      # Table names cannot be bound as parameters; tableName is a constant.
      select_sql = Template(
          "SELECT L.id, L.src_link FROM $tableName AS L "
          "LEFT JOIN body AS R ON L.id=R.news_id "
          "WHERE L.source_name='CGTN' AND actual_source_name='APP' "
          "AND R.news_id IS NULL"
      ).substitute(tableName=tableName)

      cursor = conn.cursor()
      try:
          cursor.execute(schema)
          cursor.execute(select_sql)
          # NOTE: for debugging
          # result = list(cursor.fetchall())
          # random.shuffle(result)
          rows = cursor.fetchall()
          total = len(rows)
          for counter, (news_id, src_link) in enumerate(rows, start=1):
              print("{} / {} -> ".format(counter, total), news_id, src_link)
              print('##########################################')
              # Close the HTTP response as soon as the page has been read.
              with urllib.request.urlopen(src_link) as response:
                  sauce = response.read()
              soup = BeautifulSoup(sauce, 'html.parser')
              content = soup.find("div", class_="content")
              try:
                  json_string = content["data-json"]
              except (KeyError, TypeError):
                  # KeyError: the div has no data-json attribute.
                  # TypeError: find() returned None (no such div at all).
                  # Either way there is nothing to extract.
                  body = ""
              else:
                  body = remove_tags(_extract_body_text(json_string))
                  print(body)
              # The SELECT result is fully fetched, so the cursor is reusable.
              cursor.execute(insert_sql, (str(news_id), body))
              conn.commit()
              print('')
              # Be polite to the remote server between requests.
              time.sleep(0.5)
      finally:
          cursor.close()
Add Comment
Please, Sign In to add comment