Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import os
- import re
- import datetime
def get_email_list(name, domain_list, total_left):
    """Move every address ending in one of *domain_list* out of *total_left*.

    Args:
        name: Label for the group; stored in the result tuple and echoed in
            the progress line printed to stdout.
        domain_list: Iterable of suffix strings tested with ``str.endswith``.
            It is materialised once up front, so one-shot iterators work too.
        total_left: List of addresses not yet assigned to a group. It is
            updated in place: matching addresses are removed and the
            remaining ones keep their relative order.

    Returns:
        A pair ``(language, total_left)``. ``language`` is the tuple
        ``(match_count, name, matched_addresses)``; ``total_left`` is the
        same list object that was passed in, now holding only the leftovers.
    """
    # str.endswith accepts a tuple of suffixes, which replaces the inner
    # any(...) loop and makes a generator argument safe to reuse per address.
    suffixes = tuple(domain_list)

    email_list = []
    remaining = []
    # Single-pass partition instead of list.remove() inside the loop, which
    # rescanned the list for every match.
    for email in total_left:
        if email.endswith(suffixes):
            email_list.append(email)
        else:
            remaining.append(email)

    # Slice-assign so callers holding a reference to the list still see the
    # matched addresses disappear, as they did before.
    total_left[:] = remaining

    language = (len(email_list), name, email_list)
    print("Total left {} : {} {} ".format(len(total_left), language[1], language[0]))
    return language, total_left
def _english_sections(emails):
    """Split *emails* into ``(heading, chunk)`` pairs for the output file.

    Fewer than 1000 addresses give a single "English" section. Otherwise the
    list is cut into consecutive 1000-address chunks, with at most three
    bounded chunks followed by one open-ended chunk holding everything else.
    The chunks are contiguous, so every address appears exactly once.
    """
    chunk_size = 1000
    if len(emails) < chunk_size:
        return [("English", emails)]

    # Index of the final, open-ended chunk (capped at the fourth chunk).
    last = min(len(emails) // chunk_size, 3)
    sections = []
    for index in range(last + 1):
        start = index * chunk_size
        # Headings are 1-based after the first chunk ("0-1000", "1001-2000").
        low = start + 1 if index else 0
        if index == last:
            sections.append(("English {}-".format(low), emails[start:]))
        else:
            stop = start + chunk_size
            sections.append(
                ("English {}-{}".format(low, stop), emails[start:stop]))
    return sections


def main():
    """Group the addresses from an input file by domain and write a report.

    Command line: ``<script> EMAIL_FILE [NO_REVIEWER_FILE]``.

    Each line of EMAIL_FILE is lower-cased, everything up to and including
    the last ``(`` is dropped, and ``)`` and spaces are removed. The cleaned
    lines are de-duplicated, sorted, and filtered against the lower-cased
    lines of NO_REVIEWER_FILE (when that path is given and is a file). The
    remaining addresses are handed to ``get_email_list`` once per domain
    group, in a fixed order, and the result is written to
    ``tripadvisor_<YYYY_MM_DD>.txt`` in the current directory.

    Exits with status 1 when EMAIL_FILE is missing from the command line or
    is not a file.
    """
    if len(sys.argv) < 2:
        # Previously this surfaced as an IndexError traceback.
        print("Usage: {} <email_file> [no_reviewer_file]".format(sys.argv[0]))
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.isfile(file_path):
        print("File path {} is not a file. Program Exiting..".format(
            file_path))
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    # Only used for membership tests, so a set is enough.
    no_reviewer_emails = set()
    if len(sys.argv) > 2 and os.path.isfile(sys.argv[2]):
        with open(sys.argv[2]) as fp2:
            no_reviewer_emails = {line.lower().strip('\n') for line in fp2}

    lines = []
    with open(file_path) as fp:
        for line in fp:
            line = line.lower().strip('\n')
            # Keep only what follows the last "(" on the line.
            line = re.sub(r'(?is).*\(', "", line)
            # NOTE(review): the last literal is mojibake kept verbatim from
            # the original; presumably a mis-decoded invisible space -
            # confirm against the real input encoding.
            line = line.replace(")", "").replace(" ", "").replace("รขย ", "")
            lines.append(line)
    cnt = len(lines)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    total_left = list(dict.fromkeys(lines))
    print("Total {} ".format(cnt))
    print("Total left {}. Duplicates removed.".format(len(total_left)))

    total_left = [x for x in sorted(total_left) if x not in no_reviewer_emails]
    print("Total left {}. No Reviewers removed.".format(len(total_left)))

    english_domain = ["@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
                      ".ie", ".com.sg", ".co.uk", ".com.au", "@icloud.com",
                      "@outlook.com", "@protonmail.com", "@aol.com",
                      "@ymail.com", "@live.com", "@mac.com", "@msn.com",
                      "@yahoo.ca", "@me.com", "@googlemail.com",
                      "@yahoo.com.sg", "@yahoo.ie", "@btinternet.com",
                      "@eircom.net", "@comcast.net"]
    english, total_left = get_email_list("English", english_domain, total_left)

    # Order matters: each group only sees what earlier groups left behind.
    language_domains = [
        ("Chinese", [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
                     "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]),
        ("Korean", [".kr", "@naver.com", "@hanmail.net", "@nate.com",
                    "@daum.net", "@korea.com", "@posco.com"]),
        ("Japanese", [".jp", "@ab.wakwak.com", "@nifty.com"]),
        ("German", [".de", ".at", ".ch", "@gmx.net"]),
        ("France", [".fr", ".nc", "@laposte.net", "@kedgebs.com"]),
        ("Italian", [".it"]),
        ("Russian", [".ru", "@yandex.com", "@ukr.net"]),
        ("Portugese", [".br", ".pt"]),
        ("Polish", [".pl"]),
        ("Swedish", [".se"]),
        ("Czech", [".cz"]),
        ("Croatia(Not available) ", [".hr"]),
        ("Hebrew", [".il"]),
        ("Indonesia", [".id", "@mcreasindo.com"]),
        ("Dutch", [".nl", "@chocoweb.com", "@vierbergen.net"]),
        ("Spanish", [".cl", ".ar", ".es", "@mapp-oea.org"]),
    ]
    languages = []
    for group_name, domains in language_domains:
        group, total_left = get_email_list(group_name, domains, total_left)
        languages.append(group)

    # Education is reported in its own trailing section, not ranked with the
    # language groups.
    education_domain = [".edu", ".edu.sg", ".edu.au"]
    education, total_left = get_email_list("Education", education_domain,
                                           total_left)

    vietnamese_domain = [".vn"]
    vietnamese, total_left = get_email_list("Vietnamese", vietnamese_domain,
                                            total_left)

    # Re-home English-group addresses containing one of these substrings
    # into the Vietnamese group.
    vietnamese_name = ["nguyen", "hoang", "ngoc", "phuong"]
    english_kept = []
    vietnamese_emails = list(vietnamese[2])
    for line in english[2]:
        if any(substring in line for substring in vietnamese_name):
            vietnamese_emails.append(line)
        else:
            english_kept.append(line)
    english = (len(english_kept), english[1], english_kept)
    vietnamese = (len(vietnamese_emails), vietnamese[1], vietnamese_emails)
    print("Total left {} : {} {} : {} {} "
          .format(len(total_left), english[1], english[0], vietnamese[1],
                  vietnamese[0]))
    languages.append(vietnamese)

    # Raw strings: "\." in a normal string literal is an invalid escape.
    # First pattern: a dot followed by exactly one final character.
    # Second pattern: a dot, any single character, then another dot.
    short_tail_re = re.compile(r".*\..$")
    short_label_re = re.compile(r".*\..\..*")
    bad_email = []
    still_left = []
    for line in total_left:
        # "@gmail"/"@aol" still present here were not caught by the English
        # suffixes above - presumably misspelled domains; verify.
        if ("@gmail" in line or "@aol" in line or "@" not in line
                or short_tail_re.search(line) or short_label_re.search(line)):
            bad_email.append(line)
        else:
            still_left.append(line)
    total_left = still_left
    print("Total left {} : bad {} ".format(len(total_left), len(bad_email)))
    print("========================")

    # Largest group first; sorted() is stable, so ties keep the order above.
    languages = sorted(languages, key=lambda tup: tup[0], reverse=True)
    out_file_name = "tripadvisor_{}.txt" \
        .format(datetime.datetime.today().strftime('%Y_%m_%d'))
    with open(out_file_name, "w") as of:
        for cnt, language in enumerate(languages, start=1):
            of.write("\n\n==== {}. {} ====\n".format(cnt, language[1]))
            of.write("\n".join(language[2]))

        # Contiguous chunks: the old [0:1000] / [1001:2000] / ... slices
        # dropped the addresses at indices 1000, 2000 and 3000.
        for heading, chunk in _english_sections(english[2]):
            of.write("\n\n==== {} ====\n".format(heading))
            of.write("\n".join(chunk))
        if len(english[2]) >= 3000:
            print("Contact IT. There are over 3000 emails!!!")

        of.write("\n\n==== Leftover ====\n")
        of.write("\n".join(total_left))
        of.write("\n\n==== Bad email ====\n")
        of.write("\n".join(bad_email))
        of.write("\n\n")
        of.write(" ".join(["@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
                           ".ie", ".com.sg", ".co.uk", ".com.au", ".de", ".fr",
                           ".hk", "co.jp", "@outlook.com", "@aol.com"]))
        of.write("\n")
        of.write("\n\n==== Education ====\n")
        of.write("\n".join(education[2]))
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
- # domain_list = [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
- # "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement