Advertisement
Guest User

Untitled

a guest
Mar 16th, 2019
378
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.88 KB | None | 0 0
  1. import sys
  2. import os
  3. import re
  4. import datetime
  5.  
  6.  
  7. def get_email_list(name, domain_list, total_left):
  8. emails_left = total_left.copy()
  9. email_list = []
  10. for email in emails_left:
  11. if any(email.endswith(substring) for substring in domain_list):
  12. total_left.remove(email)
  13. email_list.append(email)
  14. language = (len(email_list), name, email_list)
  15. print("Total left {} : {} {} ".format(len(total_left), language[1], language[0]))
  16. return language, total_left
  17.  
  18.  
  19. def main():
  20. file_path = sys.argv[1]
  21.  
  22. if not os.path.isfile(file_path):
  23. print("File path {} Program Exiting..".format(
  24. file_path))
  25. sys.exit()
  26.  
  27. no_reviewer_emails = []
  28. file_path2 = ""
  29. if len(sys.argv) > 2:
  30. file_path2 = sys.argv[2]
  31. if os.path.isfile(file_path2):
  32. with open(file_path2) as fp2:
  33. for line in fp2:
  34. line = line.lower().strip('\n')
  35. no_reviewer_emails.append(line)
  36.  
  37. no_reviewer_emails = list(dict.fromkeys(no_reviewer_emails))
  38. no_reviewer_emails = sorted(no_reviewer_emails)
  39.  
  40. with open(file_path) as fp:
  41. cnt = 0
  42. lines = []
  43. for line in fp:
  44. line = line.lower().strip('\n')
  45. line = re.sub(r'(?is).*\(', "", line)
  46. line = line.replace(")", "").replace(" ", "").replace("รขย ","")
  47. # print("{}".format(line))
  48. # record_word_cnt(line.strip().split(' '), bag_of_words)
  49. lines.append(line)
  50. cnt += 1
  51.  
  52. total_left = list(dict.fromkeys(lines))
  53. print("Total {} ".format(cnt))
  54. print("Total left {}. Duplicates removed.".format(len(total_left)))
  55. total_left = sorted(total_left)
  56. # [x for x in a if x not in [2, 3, 7]]
  57. total_left = [x for x in total_left if x not in no_reviewer_emails]
  58. print("Total left {}. No Reviewers removed.".format(len(total_left)))
  59.  
  60. english_domain = ["@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
  61. ".ie", ".com.sg", ".co.uk", ".com.au", "@icloud.com",
  62. "@outlook.com", "@protonmail.com", "@aol.com",
  63. "@ymail.com", "@live.com", "@mac.com", "@msn.com",
  64. "@yahoo.ca", "@me.com", "@me.com", "@googlemail.com",
  65. "@googlemail.com", "@yahoo.com.sg", "@yahoo.ie",
  66. "@btinternet.com", "@eircom.net", "@comcast.net"]
  67. english, total_left = get_email_list("English", english_domain, total_left)
  68.  
  69. languages = []
  70. chinese_domain = [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
  71. "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]
  72. chinese, total_left = get_email_list("Chinese", chinese_domain, total_left)
  73. languages.append(chinese)
  74.  
  75. korean_domain = [".kr", "@naver.com", "@hanmail.net", "@nate.com",
  76. "@nate.com", "@daum.net", "@korea.com", "@posco.com"]
  77. korean, total_left = get_email_list("Korean", korean_domain, total_left)
  78. languages.append(korean)
  79.  
  80. japanese_domain = [".jp", "@ab.wakwak.com", "@nifty.com"]
  81. japanese, total_left = get_email_list("Japanese", japanese_domain,
  82. total_left)
  83. languages.append(japanese)
  84.  
  85. german_domain = [".de", ".at", ".ch", "@gmx.net"]
  86. german, total_left = get_email_list("German", german_domain, total_left)
  87. languages.append(german)
  88.  
  89. france_domain = [".fr", ".nc", "@laposte.net", "@kedgebs.com"]
  90. france, total_left = get_email_list("France", france_domain, total_left)
  91. languages.append(france)
  92.  
  93. italian_domain = [".it"]
  94. italian, total_left = get_email_list("Italian", italian_domain, total_left)
  95. languages.append(italian)
  96.  
  97. russian_domain = [".ru", "@yandex.com", "@ukr.net"]
  98. russian, total_left = get_email_list("Russian", russian_domain, total_left)
  99. languages.append(russian)
  100.  
  101. portugese_domain = [".br", ".pt"]
  102. portugese, total_left = get_email_list("Portugese", portugese_domain,
  103. total_left)
  104. languages.append(portugese)
  105.  
  106. polish_domain = [".pl"]
  107. polish, total_left = get_email_list("Polish", polish_domain, total_left)
  108. languages.append(polish)
  109.  
  110. swedish_domain = [".se"]
  111. swedish, total_left = get_email_list("Swedish", swedish_domain, total_left)
  112. languages.append(swedish)
  113.  
  114. czech_domain = [".cz"]
  115. czech, total_left = get_email_list("Czech", czech_domain, total_left)
  116. languages.append(czech)
  117.  
  118. croatia_domain = [".hr"]
  119. croatia, total_left = get_email_list(
  120. "Croatia(Not available) ", croatia_domain, total_left)
  121. languages.append(croatia)
  122.  
  123. hebrew_domain = [".il"]
  124. hebrew, total_left = get_email_list("Hebrew", hebrew_domain, total_left)
  125. languages.append(hebrew)
  126.  
  127. indonesian_domain = [".id", "@mcreasindo.com"]
  128. indonesian, total_left = get_email_list("Indonesia", indonesian_domain,
  129. total_left)
  130. languages.append(indonesian)
  131.  
  132. dutch_domain = [".nl", "@chocoweb.com", "@vierbergen.net"]
  133. dutch, total_left = get_email_list("Dutch", dutch_domain, total_left)
  134. languages.append(dutch)
  135.  
  136. spanish_domain = [".cl", ".ar", ".es", "@mapp-oea.org"]
  137. spanish, total_left = get_email_list("Spanish", spanish_domain, total_left)
  138. languages.append(spanish)
  139.  
  140. education_domain = [".edu", ".edu.sg", ".edu.au"]
  141. education, total_left = get_email_list("Education", education_domain,
  142. total_left)
  143. # languages.append(education)
  144.  
  145. vietnamese_domain = [".vn"]
  146. vietnamese, total_left = get_email_list("Vietnamese", vietnamese_domain,
  147. total_left)
  148.  
  149. vietnamese_name = ["nguyen", "hoang", "ngoc", "phuong"]
  150. temp_lines = english[2].copy()
  151. for line in temp_lines:
  152. if any(substring in line for substring in vietnamese_name):
  153. english[2].remove(line)
  154. vietnamese[2].append(line)
  155.  
  156. english = (len(english[2]), english[1], english[2])
  157. vietnamese = (len(vietnamese[2]), vietnamese[1], vietnamese[2])
  158. print("Total left {} : {} {} : {} {} "
  159. .format(len(total_left), english[1], english[0], vietnamese[1],
  160. vietnamese[0]))
  161. languages.append(vietnamese)
  162.  
  163. temp_lines = total_left.copy()
  164. bad_email = []
  165. for line in temp_lines:
  166. if "@gmail" in line or "@aol" in line or "@" not in line \
  167. or re.search(".*\..$", line) or re.search(".*\..\..*", line):
  168. total_left.remove(line)
  169. bad_email.append(line)
  170.  
  171. print("Total left {} : bad {} ".format(len(total_left), len(bad_email)))
  172.  
  173. # print(*lines, sep='\n')
  174. # print("========================")
  175. # print(*bad_email, sep='\n')
  176. print("========================")
  177. languages = sorted(languages, key=lambda tup: tup[0], reverse=True)
  178. out_file_name = "tripadvisor_{}.txt" \
  179. .format(datetime.datetime.today().strftime('%Y_%m_%d'))
  180.  
  181. with open(out_file_name, "w") as of:
  182. cnt = 1
  183. for language in languages:
  184. of.write("\n\n==== {}. {} ====\n".format(cnt, language[1]))
  185. of.write("\n".join(language[2]))
  186. cnt = cnt + 1
  187.  
  188. if len(english[2]) < 1000:
  189. of.write("\n\n==== English ====\n")
  190. of.write("\n".join(english[2]))
  191. elif len(english[2]) < 2000:
  192. of.write("\n\n==== English 0-1000 ====\n")
  193. of.write("\n".join(english[2][0:1000]))
  194. of.write("\n\n==== English 1001- ====\n")
  195. of.write("\n".join(english[2][1001:]))
  196. elif len(english[2]) < 3000:
  197. of.write("\n\n==== English 0-1000 ====\n")
  198. of.write("\n".join(english[2][0:1000]))
  199. of.write("\n\n==== English 1001-2000 ====\n")
  200. of.write("\n".join(english[2][1001:2000]))
  201. of.write("\n\n==== English 2001- ====\n")
  202. of.write("\n".join(english[2][2001:]))
  203. else:
  204. of.write("\n\n==== English 0-1000 ====\n")
  205. of.write("\n".join(english[2][0:1000]))
  206. of.write("\n\n==== English 1001-2000 ====\n")
  207. of.write("\n".join(english[2][1001:2000]))
  208. of.write("\n\n==== English 2001-3000 ====\n")
  209. of.write("\n".join(english[2][2001:3000]))
  210. of.write("\n\n==== English 3001- ====\n")
  211. of.write("\n".join(english[2][3001:]))
  212. print("Contact IT. There are over 3000 emails!!!")
  213.  
  214. of.write("\n\n==== Leftover ====\n")
  215. of.write("\n".join(total_left))
  216. of.write("\n\n==== Bad email ====\n")
  217. of.write("\n".join(bad_email))
  218. of.write("\n\n")
  219. of.write(" ".join(["@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
  220. ".ie", ".com.sg", ".co.uk", ".com.au", ".de", ".fr",
  221. ".hk", "co.jp", "@outlook.com", "@aol.com"]))
  222.  
  223. of.write("\n")
  224. of.write("\n\n==== Education ====\n")
  225. of.write("\n".join(education[2]))
  226.  
  227. # print(out_file_name)
  228. # print(datetime.datetime.today().strftime('%Y_%m_%d'))
  229. # print(*english[2], sep='\n')
  230. # sorted_words = order_bag_of_words(bag_of_words, desc=True)
  231. # print("Most frequent 10 words {}".format(sorted_words[:10]))
  232.  
  233.  
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
  236.  
  237. # domain_list = [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
  238. # "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement