Advertisement
Guest User

Untitled

a guest
Nov 14th, 2019
136
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.97 KB | None | 0 0
  1. from more_itertools import unique_everseen
  2.  
  3. import requests, json
  4. import csv
  5. import os
  6. import glob
  7. import pandas as pd
  8.  
  9. def isMale(name):
  10.  
  11. with open('male_names1.csv','r') as maleNames:
  12.  
  13. checkList = csv.reader(maleNames, delimiter=',')
  14.  
  15. for row in checkList:
  16.  
  17. if row[0].lower() == name.lower():
  18. return 1
  19.  
  20. return 0
  21.  
  22. def excludeBio(line):
  23.  
  24. excludedWords = ["specialty", "crafted", "supplies", "domestic", "website", "order", "orders ", "$", "£", "free",
  25. "world", "worldwide", "wide", "ship", "shipping", "jewelry", "men", "men's", "guy", "guy's",
  26. "business", "days", "return", "returns", "exchange", "exchanges"]
  27.  
  28. line = line.replace('\n', '')
  29. line = line.replace('\t', '')
  30.  
  31. lineList = line.split(" ")
  32.  
  33. valsLower = [item.lower() for item in lineList]
  34.  
  35. l3 = [x for x in valsLower if x not in excludedWords]
  36.  
  37. if len(valsLower) != len(l3):
  38. print(line)
  39. return 1
  40. else:
  41. return 0
  42.  
  43. def cleanWithFilters(row):
  44.  
  45.  
  46. excludeCategoryList = ["Non-Profits & Religious Organizations", "Content & Apps", "Local Business",
  47. "Transportation & Accomodation Services", "Grocery & Convenience Stores", "Auto Dealers",
  48. "Business & Utility Services", "Product/Service", "Photographer", "Movie Theater"]
  49.  
  50. isOK = 1
  51.  
  52. try:
  53. if int(row[5]) > 3000:
  54. print("P1")
  55. isOK = 0
  56. if int(row[4]) < 30:
  57. print("P2")
  58. isOK = 0
  59. if excludeBio(row[7]):
  60. print("P3")
  61. if row[18] in excludeCategoryList:
  62. print("P4")
  63. isOK = 0
  64. if int(row[6]) < 1:
  65. print("P5")
  66. isOK = 0
  67. if not row[10]:
  68. print("P6")
  69. isOK = 0
  70. if isMale(row[2].split(' ')[0]):
  71. print(row[2].split(' ')[0])
  72. isOK = 0
  73. if checkEmailPattern(row[10]):
  74. print("P7")
  75. isOK = 0
  76.  
  77. except:
  78. print("except")
  79. isOK = 0
  80.  
  81. return isOK
  82.  
  83.  
  84. def checkEmailPattern(email):
  85.  
  86. listOfUnwantedEmails = ["hi","info","aloha","service","customerservice","hello", "help", "photo", "photographer", "shop",
  87. "customer","contact"]
  88.  
  89. emailBegins = email.split('@')[0]
  90.  
  91. if emailBegins.lower() in listOfUnwantedEmails:
  92. return 1
  93. else:
  94. return 0
  95.  
  96.  
  97.  
  98. def matchEmails(email):
  99.  
  100. with open('mejlovizaproverka.csv','r') as validEmailList:
  101. emailList = csv.reader(validEmailList, delimiter=',')
  102. for row in emailList:
  103. emailContains = 0
  104.  
  105. if row[0].lower() == email.lower():
  106. emailContains = 1
  107. break
  108.  
  109. if emailContains == 1:
  110. return 1
  111. else:
  112. return 0
  113.  
  114.  
  115. #call when nothing is done
  116. def mainFunc():
  117.  
  118. with open('amy_hetherington_followers_PR6440_1878530020.csv') as csvfile, open('amy_hetherington.csv','w+') as outputfile:
  119. readCSV = csv.reader(csvfile, delimiter=',')
  120. writer = csv.writer(outputfile, delimiter=",")
  121. writer.writerow(next(readCSV))
  122.  
  123. for row in readCSV:
  124.  
  125. if cleanWithFilters(row):
  126. writer.writerow(row)
  127.  
  128.  
  129. #call after the output file is done
  130. def mergeToMainCsv():
  131.  
  132. os.chdir("/Users/nikolasokolov/PycharmProjects/untitled/allbatches")
  133.  
  134. extension = 'csv'
  135.  
  136. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  137.  
  138. # combine all files in the list
  139. combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
  140. # export to csv
  141. combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
  142.  
  143. #remove duplicates from all previous batches compared to this one
  144. def removeDuplicates():
  145.  
  146. data = pd.read_csv("/Users/nikolasokolov/PycharmProjects/untitled/allbatches/combined_csv.csv")
  147.  
  148. df = pd.DataFrame(data)
  149.  
  150. emailList = df['email'].unique().tolist()
  151.  
  152. with open('50k.csv') as csvfile, open('50k - FINAL.csv','w+') as outputfile:
  153. readCSV = csv.reader(csvfile, delimiter=',')
  154. writer = csv.writer(outputfile, delimiter=",")
  155. writer.writerow(next(readCSV))
  156.  
  157. for row in readCSV:
  158.  
  159. if row[10] in emailList:
  160. print("Duplicate " + row[10])
  161. continue
  162. writer.writerow(row)
  163.  
  164. def addScrapedColumn():
  165.  
  166. df = pd.read_csv("stanzie.star_followers_FT433_794983088.csv", error_bad_lines=False)
  167.  
  168. df['scrapedFrom'] = "stanzie.star"
  169.  
  170. print(df)
  171.  
  172. df.to_csv('stanzie.star_followers_FT433_794983088.csv')
  173.  
  174. #redot na vikanje funkcii e:
  175.  
  176. #mainFunc() kaj shto ke go smenish file-nameot so toj shto treba da se proveri
  177. #od kako ke gi spoish u sheets i trgnesh duplicates od spoenite
  178. #downloadiraj go batchot shto e spremen i staj go u allbatches
  179. #povikaj removeDuplicates za batchot
  180. #validiraj i prakjaj
  181.  
  182.  
  183. mainFunc()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement