Advertisement
Guest User

Python_Script

a guest
Jan 27th, 2020
1,060
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 40.42 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. # 1) addScrapedColumn() & also specify datatypes for all columns
  3. # 2) combine all files in one with mergeToMainCsv()
  4. # 3) remove all duplicate emails & empty rows from the sheet with removeDuplicatesInSheet()
  5. # 4) remove all duplicate emails from previous batches removeDuplicates() - skip if new client
  6. # 5) filter the emails mainFunc()
  7. # 6) add emails to previous batches sheet mergeAllBatches() - skip if new client
  8. # VALIDATE
  9. # 7) remove all columns from NewBatch except 'email' and upload to debounce
  10. # 8) merge validated emails with batch matchEmailsAfterDeBounce()
  11. # 9) upload google sheet to client
  12. # -*- coding: utf-8 -*-
  13.  
  14. import requests, json
  15. import datetime
  16. import csv
  17. import os
  18. import glob
  19. import pandas as pd
  20. import sys
  21. import http.client
  22. import re
  23. import sys
  24. import numpy as np
  25. import random
  26.  
  27.  
  28. # male names
  29. def isMale(name):
  30. data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\MaleNames\male_names1.csv",
  31. encoding="ISO-8859-1")
  32.  
  33. df = pd.DataFrame(data)
  34.  
  35. maleNames = df['boyNames'].unique().tolist()
  36.  
  37. if name.lower() in maleNames:
  38. return 1
  39. return 0
  40.  
  41.  
  42. def doesTheEmailStartWithMaleName(email):
  43. data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\MaleNames\male_names1.csv",
  44. encoding="ISO-8859-1")
  45.  
  46. df = pd.DataFrame(data)
  47.  
  48. maleNames = df['boyNames'].unique().tolist()
  49.  
  50. for maleName in maleNames:
  51.  
  52. if len(maleName) < 4 or len(email) < 4:
  53. continue
  54.  
  55. if email.lower().startswith(maleName.lower(), 0, len(maleName)):
  56. # print("BOY NAME FOUND!!, " + maleName + ", a mejlot e ," + email)
  57. return 1
  58.  
  59. return 0
  60.  
  61.  
  62. # merge current batch to all previous batches
  63. def mergeAllBatches():
  64. os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\AllBatches")
  65. extension = 'csv'
  66. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  67. # combine all files in the list
  68. combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
  69. # export to csv
  70. combined_csv.to_csv("combined_csv.csv", index=False, encoding="utf-8")
  71.  
  72.  
  73. # match valid emails from debounce
  74. def matchEmails(email):
  75. with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\emails.csv", 'r') as validEmailList:
  76. emailList = csv.reader(validEmailList, delimiter=',')
  77. for row in emailList:
  78. emailContains = 0
  79.  
  80. if row[0].lower() == email.lower():
  81. emailContains = 1
  82. break
  83. if emailContains == 1:
  84. return 1
  85. else:
  86. return 0
  87.  
  88.  
  89. # NewBatch - current batch
  90. def matchEmailsAfterDeBounce():
  91. csv.field_size_limit(200000000)
  92. with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\combined_nodup.csv", encoding="utf-8-sig",
  93. errors='ignore') as csvfile, \
  94. open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\NewBatch_UPLOAD.csv", 'w+',
  95. encoding="utf-8-sig", newline="") as outputfile:
  96. print("Otvoreni files")
  97. readCSV = csv.reader(csvfile, delimiter=',')
  98. writer = csv.writer(outputfile, delimiter=',')
  99. writer.writerow(next(readCSV))
  100.  
  101. global i
  102. i = 0
  103. for row in readCSV:
  104. if matchEmails(row[9]):
  105. writer.writerow(row)
  106. i = i + 1
  107. print("TOTAL EMAIL IS %d", i)
  108.  
  109.  
  110. # called in cleanWithFilters()
  111. def checkEmailPattern(email):
  112. listOfUnwantedEmails = ['2015', '2016', 'noreply', '2017', '2018', '2019', '2020', 'abuse', 'academy',
  113. 'accessibility', 'account', 'accountant', 'accounting', 'accountmanager', 'accountmanagers',
  114. 'accounts', 'accountspayable', 'acquisition', 'admin', 'admin1', 'administracao',
  115. 'administracion', 'administrador', 'administratie', 'administratif', 'administration',
  116. 'administrativo', 'administrator', 'administrators', 'admins', 'adminteam', 'admissions',
  117. 'adops', 'ads', 'adventure', 'advertise', 'advertising', 'advertisingsales', 'advice',
  118. 'advisor', 'advisors', 'adwords', 'affiliate', 'affiliates', 'agence', 'agencia', 'agency',
  119. 'agents', 'alarm', 'alarms', 'alert', 'alerts', 'alexa', 'all', 'all-employees', 'all-pms',
  120. 'all-staff', 'all-team', 'all-users', 'all.employees', 'all.staff', 'all.users',
  121. 'all_staff', 'alla', 'alle', 'allemployees', 'allhands', 'allsales', 'allstaff',
  122. 'allstudents', 'allteachers', 'allteam', 'allusers', 'alpha', 'alphas', 'alumni',
  123. 'ambassadors', 'amministrazione', 'analysts', 'analytics', 'android', 'angels', 'animation',
  124. 'announce', 'announcements', 'ap', 'api', 'app', 'apple', 'application', 'applications',
  125. 'apply', 'appointments', 'apps', 'archives', 'asistente', 'asset', 'assistanthead',
  126. 'assistencia', 'assistenza', 'associates', 'associates-all', 'ateam', 'atencionalcliente',
  127. 'atendimento', 'auctions', 'available', 'backend', 'backend-dev', 'backup', 'bd',
  128. 'benefits', 'berlin', 'bestellung', 'beta', 'biblioteca', 'bibliotheque', 'billing',
  129. 'bills', 'biuro', 'biz', 'bizdev', 'blog', 'board', 'bod', 'bookclub', 'booking',
  130. 'bookings', 'boston', 'boxoffice', 'brand', 'branding', 'brands', 'brandsolutions',
  131. 'broadcast', 'buchhaltung', 'bugs', 'build', 'bursar', 'busdev', 'business',
  132. 'business_team', 'businessdevelopment', 'ca', 'caltrain', 'campaign', 'campaigns',
  133. 'campusteam', 'capacitacion', 'captain', 'captains', 'care', 'career', 'careers',
  134. 'catering', 'central', 'centro', 'ceo', 'ceos', 'channel-sales', 'chat', 'chatter', 'chef',
  135. 'chicago', 'china', 'citymanagers', 'classof2016', 'classof2017', 'classof2018',
  136. 'classof2019', 'classroom_teachers', 'client', 'clientes', 'clients', 'clientservices',
  137. 'clinic', 'cloud', 'cm', 'co-op', 'coach', 'coaches', 'coaching', 'code', 'colaboradores',
  138. 'colegio', 'com', 'comenzi', 'comercial', 'comercial1', 'comercial2', 'comments',
  139. 'commercial', 'commerciale', 'commissions', 'committee', 'comms', 'communication',
  140. 'communications', 'community', 'community', 'company', 'company.wide', 'compete',
  141. 'competition', 'compliance', 'compras', 'compta', 'comptabilite', 'comunicacao',
  142. 'comunicacion', 'comunicaciones', 'comunicazione', 'concierge', 'conference', 'connect',
  143. 'consultant', 'consultas', 'consulting', 'consultoria', 'contabil', 'contabilidad',
  144. 'contabilidade', 'contabilita', 'contact', 'contactenos', 'contacto', 'contactus',
  145. 'contador', 'contato', 'content', 'contractor', 'contractors', 'contracts', 'controller',
  146. 'coordinator', 'copyright', 'core', 'coreteam', 'corp', 'corporate', 'corporatesales',
  147. 'council', 'courrier', 'creative', 'crew', 'crm', 'cs', 'csm', 'csteam', 'cultura',
  148. 'culture', 'customer', 'customer.service', 'customercare', 'customerfeedback', 'customers',
  149. 'customerservice', 'customerservicecenter', 'customerservices', 'customersuccess',
  150. 'customersupport', 'custserv', 'daemon', 'data', 'database', 'deals', 'dean', 'delivery',
  151. 'demo', 'denver', 'departures', 'deploy', 'deputy', 'deputyhead', 'design', 'designer',
  152. 'designers', 'dev', 'developer', 'developers', 'development', 'devnull', 'devops', 'devs',
  153. 'devteam', 'digital', 'digsitesvalue', 'direccion', 'direction', 'director', 'directors',
  154. 'directory', 'diretoria', 'direzione', 'discuss', 'dispatch', 'diversity', 'dns', 'docs',
  155. 'domain', 'domainmanagement', 'domains', 'donations', 'donors', 'download', 'dreamteam',
  156. 'ecommerce', 'editor', 'editorial', 'editors', 'education', 'einkauf', 'email', 'emergency',
  157. 'employee', 'employees', 'employment', 'eng', 'eng-all', 'engagement', 'engineering',
  158. 'engineers', 'english', 'enq', 'enquire', 'enquires', 'enquiries', 'enquiry', 'enrollment',
  159. 'enterprise', 'equipe', 'equipo', 'error', 'errors', 'escritorio', 'europe', 'event',
  160. 'events', 'everybody', 'everyone', 'exec', 'execs', 'execteam', 'executive', 'executives',
  161. 'expenses', 'expert', 'experts', 'export', 'facilities', 'facturacion', 'faculty', 'family',
  162. 'farmacia', 'faturamento', 'fax', 'fbl', 'feedback', 'fellows', 'finance', 'financeiro',
  163. 'financeiro2', 'finanzas', 'firmapost', 'fiscal', 'food', 'football', 'founders', 'france',
  164. 'franchise', 'friends', 'frontdesk', 'frontend', 'frontoffice', 'fte', 'ftp', 'fulltime',
  165. 'fun', 'fundraising', 'gardner', 'geeks', 'general', 'geral', 'giving', 'global', 'grants',
  166. 'graphics', 'group', 'growth', 'hackathon', 'hackers', 'head', 'head.office', 'headoffice',
  167. 'heads', 'headteacher', 'hello', 'help', 'helpdesk', 'hi', 'highschool', 'hiring', 'hola',
  168. 'home', 'homes', 'hosting', 'hostmaster', 'hotel', 'house', 'hq', 'hr', 'hrdept', 'hsstaff',
  169. 'hsteachers', 'humanresources', 'ideas', 'implementation', 'import', 'inbound', 'inbox',
  170. 'india', 'info', 'infor', 'informacion', 'informatica', 'information', 'informatique',
  171. 'informativo', 'infra', 'infrastructure', 'ingenieria', 'innovation', 'inoc', 'inquiries',
  172. 'inquiry', 'insidesales', 'insights', 'instagram', 'insurance', 'integration',
  173. 'integrations', 'intern', 'internal', 'international', 'internet', 'interns', 'internship',
  174. 'invest', 'investment', 'investor', 'investorrelations', 'investors', 'invoice', 'invoices',
  175. 'invoicing', 'ios', 'iphone', 'ir', 'ispfeedback', 'ispsupport', 'it', 'ithelp',
  176. 'itsupport', 'itunes', 'jira', 'job', 'jobs', 'join', 'jornalismo', 'junk', 'kontakt',
  177. 'kundeservice', 'la', 'lab', 'laboratorio', 'labs', 'ladies', 'latam', 'launch', 'lead',
  178. 'leaders', 'leadership', 'leadership-team', 'leadershipteam', 'leads', 'leasing', 'legal',
  179. 'letters', 'library', 'licensing', 'links', 'list', 'list-request', 'login', 'logistica',
  180. 'logistics', 'logistiek', 'lt', 'lunch', 'mail', 'mailbox', 'maildaemon', 'mailer-daemon',
  181. 'mailerdaemon', 'mailing', 'maintenance', 'management', 'management-group',
  182. 'management.team', 'management_team', 'manager', 'managers', 'marketing', 'marketing-ops',
  183. 'marketing-team', 'marketingteam', 'marketplace', 'master', 'mayor', 'md', 'media',
  184. 'meetup', 'member', 'members', 'membership', 'mentors', 'metrics', 'mgmt', 'middleschool',
  185. 'misc', 'mkt', 'mktg', 'mobile', 'monitor', 'monitoring', 'montreal', 'msstaff',
  186. 'msteachers', 'mt', 'music', 'network', 'newbiz', 'newbusiness', 'news', 'newsletter',
  187. 'newyork', 'nntp', 'no-reply', 'no.replay', 'no.reply', 'nobody', 'noc', 'none', 'noreply',
  188. 'noresponse', 'northamerica', 'nospam', 'notes', 'notifications', 'notify', 'nps', 'null',
  189. 'ny', 'nyc', 'nyoffice', 'offboarding', 'offers', 'office', 'officeadmin', 'officemanager',
  190. 'officers', 'officestaff', 'offtopic', 'oficina', 'onboarding', 'online', 'onsite', 'ooo',
  191. 'operaciones', 'operations', 'ops', 'order', 'orders', 'ordini', 'outage', 'outreach',
  192. 'owners', 'parents', 'paris', 'partner', 'partners', 'partnerships', 'parts', 'pay',
  193. 'payment', 'payments', 'paypal', 'payroll', 'pd', 'people', 'peoplemanagers', 'peopleops',
  194. 'performance', 'personnel', 'phish', 'phishing', 'photos', 'planning', 'platform', 'pm',
  195. 'portfolio', 'post', 'postbox', 'postfix', 'postmaster', 'ppc', 'pr', 'prefeitura',
  196. 'presales', 'presidencia', 'president', 'presidente', 'press', 'presse', 'prime',
  197. 'principal', 'principals', 'privacy', 'procurement', 'prod', 'produccion', 'product',
  198. 'product-team', 'product.growth', 'product.management', 'product.managers', 'product.team',
  199. 'production', 'productmanagers', 'products', 'productteam', 'produto', 'program',
  200. 'programs', 'project', 'projectmanagers', 'projects', 'promo', 'promotions', 'protocollo',
  201. 'proveedores', 'publicidade', 'publisher', 'publishers', 'purchase', 'purchases',
  202. 'purchasing', 'qa', 'qualidade', 'questions', 'quotes', 'random', 'realestate', 'receipts',
  203. 'recepcion', 'reception', 'receptionist', 'recruit', 'recruiter', 'recruiters',
  204. 'recruiting', 'recruitment', 'recrutement', 'recursoshumanos', 'redacao', 'redaccion',
  205. 'redaction', 'redazione', 'referrals', 'register', 'registrar', 'registration',
  206. 'relacionamento', 'release', 'releases', 'remote', 'remove', 'rentals', 'report',
  207. 'reporting', 'reports', 'request', 'requests', 'research', 'reservaciones', 'reservas',
  208. 'reservation', 'reservations', 'residents', 'response', 'restaurant', 'resume', 'resumes',
  209. 'retail', 'returns', 'revenue', 'rezervari', 'rfp', 'rnd', 'rockstars', 'root', 'rrhh',
  210. 'rsvp', 'sales', 'sales-team', 'sales.team', 'sales1', 'sales2', 'salesengineers',
  211. 'salesforce', 'salesops', 'salesteam', 'sanfrancisco', 'school', 'schooloffice', 'science',
  212. 'sdr', 'se', 'search', 'seattle', 'secretaria', 'secretariaat', 'secretaris', 'secretary',
  213. 'security', 'sekretariat', 'sem', 'seniors', 'seo', 'server', 'service', 'serviceclient',
  214. 'servicedesk', 'services', 'servicioalcliente', 'sf', 'sf-office', 'sfo', 'sfoffice',
  215. 'sfteam', 'shareholders', 'shipping', 'shop', 'shopify', 'shopping', 'signup', 'signups',
  216. 'singapore', 'sistemas', 'site', 'smtp', 'social', 'socialclub', 'socialmedia', 'socios',
  217. 'software', 'solutions', 'soporte', 'sos', 'spam', 'sponsorship', 'sport', 'squad', 'staff',
  218. 'startups', 'stats', 'stockholm', 'store', 'stories', 'strategy', 'stripe', 'student',
  219. 'students', 'studio', 'submissions', 'submit', 'subscribe', 'subscriptions', 'success',
  220. 'suggestions', 'supervisor', 'supervisors', 'suporte', 'supply', 'support', 'support-team',
  221. 'supportteam', 'suprimentos', 'sydney', 'sysadmin', 'system', 'systems', 'ta', 'talent',
  222. 'tax', 'teachers', 'team', 'teamleaders', 'teamleads', 'tech', 'technical', 'technik',
  223. 'technology', 'techops', 'techsupport', 'techteam', 'tecnologia', 'tesoreria', 'test',
  224. 'testgroup', 'testing', 'the.principal', 'theoffice', 'theteam', 'tickets', 'time',
  225. 'timesheets', 'todos', 'tools', 'tour', 'trade', 'trainers', 'training', 'transport',
  226. 'travel', 'treasurer', 'tribe', 'trustees', 'turismo', 'twitter', 'uk',
  227. 'undisclosed-recipients', 'unsubscribe', 'update', 'updates', 'us', 'usa', 'usenet', 'user',
  228. 'users', 'usteam', 'uucp', 'ux', 'vendas', 'vendas1', 'vendas2', 'vendor', 'vendors',
  229. 'ventas', 'ventas1', 'ventas2', 'verkauf', 'verwaltung', 'video', 'vip', 'voicemail',
  230. 'volunteer', 'volunteering', 'volunteers', 'vorstand', 'warehouse', 'watercooler', 'web',
  231. 'webadmin', 'webdesign', 'webdev', 'webinars', 'webmaster', 'website', 'webteam', 'welcome',
  232. 'whois', 'wholesale', 'women', 'wordpress', 'work', 'workshop', 'writers', 'www',
  233. 'zentrale']
  234. listOfUnwantedProviders = ['ru', 'bg']
  235. emailBegins = email.split('@')[0]
  236. emailEnds = email.split('.')[-1]
  237. if '@' not in email:
  238. return 1
  239.  
  240. if doesTheEmailStartWithMaleName(emailBegins):
  241. print("Email boy name " + email)
  242. return 1
  243.  
  244. if emailEnds.lower() in listOfUnwantedProviders:
  245. return 1
  246.  
  247. if emailBegins.lower() not in listOfUnwantedEmails:
  248. return 0
  249. else:
  250. return 1
  251.  
  252.  
  253. # called in cleanWithFilters()
  254. def excludeBio(line):
  255. excludedWords = ["specialty", "crafted", "supplies", "domestic", "website", "order", "orders", "$", "£", "free",
  256. "worldwide", "wide", "ship", "shipping", "jewelry", "men", "men's", "guy", "guy's", "business",
  257. "days",
  258. "return", "returns", "exchange", "exchanges", "make up", "MUA", "mua", "make ", "photography",
  259. "ethical",
  260. "hand", "made", "handmade", "hand made", "knit", "knitwear", "beauty", "lux", "luxury", "designed",
  261. "designed in",
  262. "to get featured", "tag", "apparel", "sustainable", "made in", "made", "online boutique",
  263. "boutique", "online", "photographer", "photography"]
  264.  
  265. line = line.replace('\n', '')
  266. line = line.replace('\t', '')
  267. lineList = line.split(" ")
  268.  
  269. valsLower = [item.lower() for item in lineList]
  270.  
  271. l3 = [x for x in valsLower if x not in excludedWords]
  272.  
  273. if len(valsLower) != len(l3):
  274. return 1
  275. else:
  276. return 0
  277.  
  278.  
  279. # called in mainFunc()
  280. def cleanWithFilters(row):
  281. isOK = 1
  282. try:
  283. if not row[10]: # nema email
  284. print("No email")
  285. isOK = 0
  286. # elif row[7].strip() == "" or int(row[7].split(".")[0]) > 3500: # following count
  287. # print("Following " + row[1] + " " + row[7])
  288. # isOK = 0
  289. # elif (row[8].strip() == "" or int(row[8].split(".")[0]) < 30) or int(
  290. # row[8].split(".")[0]) > 50000: # followers
  291. # print("Followers " + row[1] + " " + row[8])
  292. # isOK = 0
  293. # elif row[2].strip() == "" or int(row[2].split(".")[0]) < 1: # media count
  294. # print("Media count " + row[2].strip())
  295. # isOK = 0
  296. elif checkEmailPattern(row[10]): # email pattern-ot ne e dobar
  297. print("Email pattern " + row[10])
  298. isOK = 0
  299. except:
  300. isOK = 0
  301. pass
  302.  
  303. try:
  304.  
  305. if isMale(row[6].split(' ')[0]):
  306. print("MALE " + row[6].split(" ")[0])
  307. isOK = 0
  308.  
  309. except:
  310. pass
  311.  
  312. return isOK
  313.  
  314.  
  315. # filter the data based on the client's filters
  316. def mainFunc():
  317. csv.field_size_limit(2000000000)
  318. # sys.stdout = open("goat.csv", "w")
  319. with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\NewBatch.csv", encoding="utf-8-sig",
  320. errors='ignore') as csvfile, \
  321. open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\FilteredBatch.csv", 'w+', newline="",
  322. encoding="utf-8-sig") as outputfile:
  323. readCSV = csv.reader(csvfile, delimiter=',')
  324. writer = csv.writer(outputfile, delimiter=",")
  325. writer.writerow(next(readCSV))
  326. global i
  327. i = 0
  328. for row in readCSV:
  329. if cleanWithFilters(row):
  330. writer.writerow(row)
  331. i = i + 1
  332. print("TOTAL EMAIL IS %d", i)
  333.  
  334.  
  335. # remove duplicates from all previous batches compared to this one
  336. def removeAllDuplicates():
  337. csv.field_size_limit(200000000)
  338. data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\AllBatches\combined_csv.csv")
  339. df = pd.DataFrame(data)
  340. emailList = df['email'].unique().tolist()
  341.  
  342. with open('NewBatch.csv', 'r', encoding="utf-8-sig") as csvfile, \
  343. open('FilteredBatch.csv', 'w+', newline="", encoding="utf-8-sig") as outputfile:
  344.  
  345. csv.field_size_limit(200000000)
  346. readCSV = csv.reader(csvfile, delimiter=',')
  347. writer = csv.writer(outputfile, delimiter=",")
  348. writer.writerow(next(readCSV))
  349.  
  350. for row in readCSV:
  351. try:
  352. if row[8] in emailList:
  353. print("Duplicate " + row[8])
  354. continue
  355. writer.writerow(row)
  356. except:
  357. pass
  358.  
  359.  
  360. # remove duplicates from the sheet
  361. def removeDuplicatesinSheet():
  362. df = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\newBatch_nodup.csv", error_bad_lines=False)
  363. df = df.drop_duplicates(subset='email')
  364. df.sort_index(axis='columns', inplace=True, ascending=False)
  365. df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\Batch_upload.csv", index=False)
  366.  
  367.  
  368. # call after the output file is done
  369. def mergeToMainCsv():
  370. csv.field_size_limit(2000000000)
  371. os.chdir(r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output\CetvrtBatch")
  372. extension = 'csv'
  373. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  374. sep = ","
  375. first = 0
  376. with open(r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output\CetvrtBatch\Batch.csv", "a+",
  377. encoding="utf-8-sig") as targetfile:
  378. for filename in all_filenames:
  379. with open(
  380. r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output\CetvrtBatch\\" + filename,
  381. "r", encoding="utf-8-sig") as f:
  382. # csvreader=csv.reader(f,delimiter=",")
  383. print(filename)
  384. if first != 0:
  385. next(f) # << only if the first line contains headers
  386. for line in f:
  387. targetfile.write(line)
  388. first = first + 1
  389.  
  390.  
  391. # add scraped_from column at the end and specify datatypes
# add scraped_from column at the end and specify datatypes
def cleanData():
    """Normalise every CSV in the Daniel folder for database upload.

    For each file (expected name format "<scrapedFrom>-<country>-<niche>...csv"
    — assumed from the '-' split below; TODO confirm):
      * strips whitespace from header names,
      * drops columns not in keepColumns (renaming any 'public_email'
        column to 'email' instead of deleting it),
      * adds any keepColumns that are missing as empty columns,
      * adds scraped_from / source_country / source_niche from the file
        name and a date_scraped drawn at random from a fixed date pool,
      * replaces commas with semicolons in the free-text columns,
      * sorts columns by name descending, coerces the count columns to
        int and the engagement columns to float,
      * drops rows without an email, and
      * writes the result to "<scrapedFrom>_database.csv".
    """
    os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel")
    extension = 'csv'
    # columns the database upload expects; everything else is removed
    keepColumns = ["userId", "username", "full_name", "follower_count", "following_count", "media_count", "biography",
                   "hasProfilePic",
                   "external_url", "email", "contact_phone_number", "address_street", "isbusiness", "Engagement %",
                   "AvgLikes", "AvgComments",
                   "MostRecentPostDate", "category", "cityName", "businessJoinDate", "businessCountry", "businessAds",
                   "countryCode"]

    intListTypes = ["follower_count", "following_count", "media_count"]
    floatListTypes = ["Engagement %", "AvgLikes", "AvgComments"]
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

    for f in all_filenames:
        name = f.split('-')
        df = pd.read_csv(f, error_bad_lines=False, lineterminator='\n')
        print("Current file " + name[0])
        print("Country: " + name[1] + ", niche: " + name[2])
        count_row = df.shape[0]
        count_column = df.shape[1]
        # NOTE(review): these prints pass the count as a second print argument,
        # so the literal "%d" is printed rather than formatted
        print("Total rows is %d", count_row)
        print("Total columns atm %d", count_column)

        print("Stripping columns from /r/n")
        for column in df.columns:
            stripped = column.strip()
            df.rename(columns={column: stripped}, inplace=True)

        # deleting empty and not needed columns
        keepColumnsLower = [item.lower() for item in keepColumns]
        # NOTE(review): iteration uses lowercased names but rename/del use them
        # as keys — this only matches when the real header is already lowercase;
        # confirm against the data files
        for column in df.columns.str.lower():
            if column.strip() not in keepColumnsLower:
                if 'public_email' in column:
                    print("Renaming column " + column)
                    df.rename(columns={column: 'email'}, inplace=True)
                else:
                    print("Deleting " + column)
                    del df[column]

        # adding missing columns for database
        # print("Adding missing columns")
        for column in keepColumns:
            lower = column.lower()
            strip_list = [item.strip() for item in df.columns.str.lower()]
            if lower not in strip_list:
                print("Added column: " + column)
                df[column] = ""

        # fixed pool of Nov/Dec 2019 dates used to back-fill date_scraped
        dates = ["12/01/2019", "12/02/2019", "12/03/2019", "12/04/2019", "12/05/2019", "12/06/2019", "12/07/2019",
                 "12/08/2019", "12/09/2019", "12/10/2019", "12/11/2019", "12/12/2019", "12/13/2019", "12/14/2019",
                 "12/15/2019", "12/16/2019", "12/17/2019", "12/18/2019", "12/19/2019", "12/20/2019", "12/21/2019",
                 "12/22/2019", "12/23/2019", "12/24/2019", "12/25/2019", "12/26/2019", "12/27/2019", "12/28/2019",
                 "12/29/2019", "12/30/2019", "11/01/2019", "11/02/2019", "11/03/2019", "11/04/2019", "11/05/2019",
                 "11/06/2019", "11/07/2019", "11/08/2019", "11/09/2019", "11/10/2019", "11/11/2019", "11/12/2019",
                 "11/13/2019", "11/14/2019", "11/15/2019", "11/16/2019", "11/17/2019", "11/18/2019", "11/19/2019",
                 "11/20/2019", "11/21/2019", "11/22/2019", "11/23/2019", "11/24/2019", "11/25/2019", "11/26/2019",
                 "11/27/2019", "11/28/2019", "11/29/2019", "11/30/2019"]

        # add columns derived from the file name
        df['scraped_from'] = name[0]
        df['source_country'] = name[1]
        df['source_niche'] = name[2]
        # random (non-deterministic) back-filled scrape date from the pool above
        df['date_scraped'] = random.choice(dates)
        # df['date_scraped'] = today = datetime.datetime.now().strftime("%d/%m/%y")

        # replacing , with ; so the free text cannot break the CSV delimiter
        stringColumns = ["biography", "category", "full_name", "address_street", "cityName", "external_url", "email"]
        # stringColumnsLower = [item.lower() for item in keepColumns]
        for column in stringColumns:
            if column.lower().strip() in df.columns.str.lower():
                df[column] = df[column].fillna('').astype(str).str.replace(',', ';')
                print("Replaced , & ; in column " + column)

        # sort columns by name, descending
        df.sort_index(axis='columns', inplace=True, ascending=False)

        # specify data types (unparseable values become 0)
        for type in intListTypes:
            try:
                df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(int)
            except:
                print("Converting type error " + type)
        for type in floatListTypes:
            try:
                df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(float)
            except:
                print("Converting type error " + type)

        # df.zipcode.astype(str)

        # sanity check: the upload schema expects exactly 27 columns
        count_column = df.shape[1]
        print("Total columns atm %d", count_column)
        if count_column != 27:
            print("Columns count should be 27, check if everything is okay!")
        print("Datatypes")
        print(df.dtypes)
        # drop rows without an email
        df['email'].replace('', np.nan, inplace=True)
        df.dropna(subset=['email'], inplace=True)
        # save dataframe in a new file
        df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\\" + name[0] + "_database.csv", index=False,
                  encoding="utf-8-sig")
        # remove the old file
        # os.remove(f)
  496.  
  497.  
  498. def printUsernames():
  499. list = ["adelemaree", "aianational", "albuslumen", "aleciataryn_fitforlife", "alenpalander", "alicemcnails",
  500. "allblacks7s", "alliemtaylor", "allyfashion", "amber.blaxland", "amysheppardpie", "amy_hetherington",
  501. "andrewtkearns", "anita_ghise", "anna.davey", "antoniagigovskaa", "antonijalokvenec", "archdaily",
  502. "archdigest", "architectmag", "ariarne_lepine", "ashleymescia", "asiyami_gold", "aumi", "azaleamodels",
  503. "azlinnicolette", "baffmasta", "Batch.csv", "beachbunnyswimwear", "bel_sloane", "bjarkeingels",
  504. "blogbeautyglamfashion", "bondeyeswim", "bondibather", "bonnieandersonmusic", "boys_run_free", "bran.wolf",
  505. "brandonhardbody", "brandonwoelfel", "breakfastinsydney", "bricharismakeup", "bryanadamc", "bubs2bikinis",
  506. "cairo_dwek", "calsnape", "cassimanner", "changing_habits", "charlbi143", "chrismelberger", "conagh_kat",
  507. "cyd_morris", "damon_baker", "danidiamondphotography", "danielinskeep", "dannylomass", "denim.archives",
  508. "dexthefreak", "dezeen", "dfreske", "dmv.makeupartistry", "duskiiactive", "eivilocs", "ejmaxwell",
  509. "elghazy_hamza", "eliasriadi", "ellymiles", "eloratahiti", "emileemacc", "emmachenartistry",
  510. "emmahyndman_theposingpro", "enfantsrichesdeprimes", "entrepreneur", "erinscottbeauty", "esmedewitt",
  511. "estefaniac2t", "facehalo", "flightfac", "forevernew_official", "fujifilmx_au", "fursty", "gallucks",
  512. "garyvee", "georgychernyadyev", "gianlucavacchi", "glamgirlamy", "gp.yt", "grenadeofficial", "halotopuk",
  513. "harvardgsd", "hotmess", "housesaddictive", "icingandglitter", "igkansascity", "imogencaldwell", "ioegreer",
  514. "irenenoren", "irenerudnykphoto", "itswillcarmack", "jade_packer", "jaivasicek", "jake_of_all_trades",
  515. "jamescharles", "jannikobenhoff", "janny.organically", "jasmine__clarke", "jelenamarkovic___",
  516. "jerichoroadclothing", "jessedriftwood", "jessicajane_au", "jessicakobeissi", "jetsswimwear", "johnpawson",
  517. "jordanpeele", "juvahn.victoria", "kai.boet", "kalula_tattoo", "karimamckimmie", "katrikats", "katysurfs",
  518. "kaylah.model", "keeganmichaelkey", "kehaulanisanares", "kerrysexton_ifbbpro", "kim_akrich", "kirstycane",
  519. "kivari_the_label", "kuwaii", "leahjay_", "lex.weinstein", "lildenimjean", "lioninthewild",
  520. "livelovebeirut", "longbeach_griffy", "lucentement", "lukasabbat", "lukehinesonline", "maanonu",
  521. "maisonsdumonde_es", "malakaifekitoa"]
  522. os.chdir(r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output")
  523. extension = 'csv'
  524. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  525. i = 0
  526. for f in all_filenames:
  527. if 'database' in f:
  528. name = f.split("_database")
  529. else:
  530. name = f.split('-')
  531. i = i + 1
  532. print(name[0])
  533. print("Total number %d", i)
  534.  
  535.  
  536. def singleChangeCommas():
  537. df = pd.read_csv(r"C:\Users\Administrator\Desktop\test\emails28SPLITTED.csv", error_bad_lines=False,
  538. lineterminator='\n', low_memory=False)
  539. stringColumns = ["cityName"]
  540. stringColumnsPhone = ["contact_phone_number"]
  541.  
  542. for column in stringColumns:
  543. if column.lower().strip() in df.columns.str.lower():
  544. # df[column] = '"' + df[column] + '"'
  545. df[column] = df[column].str.replace("'", '')
  546. df[column] = df[column].str.replace("[", '')
  547. df[column] = df[column].str.replace("]", '')
  548. print("Replaced , & ; in column " + column)
  549.  
  550. print("Fixing columns")
  551.  
  552. for column in df.columns:
  553. stripped = column.strip()
  554. df.rename(columns={column: stripped}, inplace=True)
  555. df.to_csv(r"C:\Users\Administrator\Desktop\Profiles for database\New folder\Batches\uploadDatabaseNEW.csv",
  556. encoding="utf-8-sig", index=False)
  557.  
  558.  
  559. def printDtypes():
  560. df = pd.read_csv(r"C:\Users\Administrator\Desktop\Profiles for database\Site\Output\Third100\Third100.csv",
  561. error_bad_lines=False, lineterminator='\n', low_memory=False)
  562. print(df.dtypes)
  563. print("Fixing column names")
  564. for column in df.columns:
  565. stripped = column.strip()
  566. df.rename(columns={column: stripped}, inplace=True)
  567.  
  568. stringColumns = ["biography", "category", "full_name", "address_street", "cityName", "external_url", "email"]
  569. for column in stringColumns:
  570. if column.lower().strip() in df.columns.str.lower():
  571. df[column] = df[column].fillna('').astype(str).str.replace(',', ';')
  572. print("Replaced , & ; in column " + column)
  573.  
  574. df.to_csv(r"C:\Users\Administrator\Desktop\Profiles for database\Site\Output\Third100\Third100_database.csv",
  575. index=False, encoding="utf-8-sig")
  576. print(df.dtypes)
  577.  
  578.  
  579. def addScrapedColumn():
  580. os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data")
  581. extension = 'csv'
  582. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  583. intListTypes = ["follower_count", "following_count", "media_count"]
  584. floatListTypes = ["Engagement %", "AvgLikes", "AvgComments"]
  585. for f in all_filenames:
  586. name = f.split('_')
  587. df = pd.read_csv(f, error_bad_lines=False, lineterminator='\n')
  588. print("Current file " + name[0])
  589. df['scraped_from'] = name[0]
  590. print("Stripping columns from /r/n")
  591. for column in df.columns:
  592. stripped = column.strip()
  593. df.rename(columns={column: stripped}, inplace=True)
  594. # specify data types
  595. for type in intListTypes:
  596. try:
  597. df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(int)
  598. except:
  599. print("Converting type error " + type)
  600. for type in floatListTypes:
  601. try:
  602. df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(float)
  603. except:
  604. print("Converting type error " + type)
  605.  
  606. df.sort_index(axis='columns', inplace=True, ascending=False)
  607. df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data\\" + name[0] + "-followers.csv",
  608. index=False, encoding="utf-8-sig")
  609.  
  610.  
  611. # only emails
  612. def leaveOnlyEmails():
  613. os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data")
  614. extension = 'csv'
  615. keepColumns = ["userId", "username", "full_name", "follower_count", "following_count", "media_count", "biography",
  616. "hasProfilePic",
  617. "external_url", "email", "contact_phone_number", "address_street", "isbusiness", "Engagement %",
  618. "AvgLikes", "AvgComments",
  619. "MostRecentPostDate", "category", "cityName", "businessJoinDate", "businessCountry", "businessAds",
  620. "countryCode"]
  621.  
  622. intListTypes = ["follower_count", "following_count", "media_count"]
  623. floatListTypes = ["Engagement %", "AvgLikes", "AvgComments"]
  624. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  625.  
  626. for f in all_filenames:
  627. name = f.split('_')
  628. df = pd.read_csv(f, error_bad_lines=False, lineterminator='\n')
  629. print("Current file " + name[0])
  630. print("Stripping columns from /r/n")
  631. for column in df.columns:
  632. stripped = column.strip()
  633. df.rename(columns={column: stripped}, inplace=True)
  634.  
  635. # deleting empty and not needed columns
  636. keepColumnsLower = [item.lower() for item in keepColumns]
  637. for column in df.columns.str.lower():
  638. if column.strip() not in keepColumnsLower:
  639. if 'public_email' in column:
  640. print("Renaming column " + column)
  641. df.rename(columns={column: 'email'}, inplace=True)
  642. else:
  643. print("Deleting " + column)
  644. del df[column]
  645.  
  646. # adding missing columns for database
  647. # print("Adding missing columns")
  648. for column in keepColumns:
  649. lower = column.lower()
  650. strip_list = [item.strip() for item in df.columns.str.lower()]
  651. if lower not in strip_list:
  652. print("Added column: " + column)
  653. df[column] = ""
  654.  
  655. # replacing , with ;
  656. stringColumns = ["biography", "category", "full_name", "address_street", "cityName", "external_url", "email"]
  657. # stringColumnsLower = [item.lower() for item in keepColumns]
  658. for column in stringColumns:
  659. if column.lower().strip() in df.columns.str.lower():
  660. df[column] = df[column].fillna('').astype(str).str.replace(',', ';')
  661. print("Replaced , & ; in column " + column)
  662.  
  663. # sort columns
  664. df.sort_index(axis='columns', inplace=True, ascending=False)
  665.  
  666. # specify data types
  667. for type in intListTypes:
  668. try:
  669. df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(int)
  670. except:
  671. print("Converting type error " + type)
  672. for type in floatListTypes:
  673. try:
  674. df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(float)
  675. except:
  676. print("Converting type error " + type)
  677.  
  678. # df.zipcode.astype(str)
  679.  
  680. count_column = df.shape[1]
  681. print("Total columns atm %d", count_column)
  682. if count_column != 27:
  683. print("Columns count should be 27, check if everything is okay!")
  684. print("Datatypes")
  685. print(df.dtypes)
  686. df['email'].replace('', np.nan, inplace=True)
  687. df.dropna(subset=['email'], inplace=True)
  688. # save dataframe in a new file
  689. df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data\\" + name[0] + "_onlyEmails.csv",
  690. index=False, encoding="utf-8-sig")
  691. # remove the old file
  692. # os.remove(f)
  693.  
  694.  
  695. def deleteByKeyword():
  696. os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data")
  697. extension = 'csv'
  698. all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
  699. for f in all_filenames:
  700. if "onlyEmails" in f:
  701. print("Keeping " + f)
  702. else:
  703. print("Deleting " + f)
  704. os.remove(f)
  705.  
  706.  
  707. def matchSplitCsv():
  708. csv.field_size_limit(2000000000)
  709. # sys.stdout = open("goat.csv", "w")
  710. mainList = []
  711. with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\emails.csv", encoding="utf-8-sig",
  712. errors='ignore') as csvfile, \
  713. open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\splitEmails.csv", 'w+', newline="",
  714. encoding="utf-8-sig") as outputfile:
  715. readCSV = csv.reader(csvfile, delimiter=',')
  716. writer = csv.writer(outputfile, delimiter=",")
  717. writer.writerow(next(readCSV))
  718. tempList = []
  719. i = 0
  720. for row in readCSV:
  721. if (i < 700):
  722. tempList.append(row)
  723. i = i + 1
  724. else:
  725. mainList.append(tempList)
  726. tempList = []
  727. tempList.append(row)
  728. i = 1
  729.  
  730. if len(tempList) > 0:
  731. mainList.append(tempList)
  732.  
  733. df = pd.DataFrame.from_records(mainList)
  734.  
  735. df = df.transpose()
  736. for column in df.columns:
  737. df[column] = df[column].astype(str).str.replace("'", '')
  738. df[column] = df[column].astype(str).str.replace("[", '')
  739. df[column] = df[column].astype(str).str.replace("]", '')
  740. df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\splitEmails.csv", index=False)
  741.  
  742.  
  743. def sortColumns():
  744. df = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\boutinelastudio_followers.csv",
  745. error_bad_lines=False, lineterminator='\n', low_memory=False)
  746. df.sort_index(axis='columns', inplace=True, ascending=False)
  747. df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\boutinelastudio_followers_sorted.csv",
  748. index=False)
  749.  
  750.  
  751. mainFunc()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement