Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
# Pipeline steps:
# 1) addScrapedColumn() - add scraped_from column & specify datatypes for all columns
# 2) combine all files into one with mergeToMainCsv()
# 3) remove all duplicate emails & empty rows from the sheet with removeDuplicatesinSheet()
# 4) remove all duplicate emails seen in previous batches with removeAllDuplicates() - skip if new client
# 5) filter the emails with mainFunc()
# 6) add emails to the previous-batches sheet with mergeAllBatches() - skip if new client
# VALIDATE:
# 7) remove all columns from NewBatch except 'email' and upload to DeBounce
# 8) merge validated emails back into the batch with matchEmailsAfterDeBounce()
# 9) upload the Google sheet to the client
- import requests, json
- import datetime
- import csv
- import os
- import glob
- import pandas as pd
- import sys
- import http.client
- import re
- import sys
- import numpy as np
- import random
- # male names
def isMale(name):
    """Return 1 if *name* (case-insensitive) is in the male-names reference list, else 0.

    The reference CSV is read once and cached on the function object; the
    original implementation re-read the file and re-built the list on every
    call, and scanned it linearly.  Set membership makes each lookup O(1).
    """
    if not hasattr(isMale, "_maleNames"):
        data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\MaleNames\male_names1.csv",
                           encoding="ISO-8859-1")
        # NOTE: entries are kept exactly as stored in the CSV (original code
        # compared name.lower() against them un-lowered, so we preserve that).
        isMale._maleNames = set(pd.DataFrame(data)['boyNames'].unique().tolist())
    if name.lower() in isMale._maleNames:
        return 1
    return 0
def doesTheEmailStartWithMaleName(email):
    """Return 1 if *email* starts (case-insensitively) with a known male first name, else 0.

    Only names of length >= 4 are considered, and emails shorter than 4
    characters never match (both rules preserved from the original).  The
    reference CSV is read once and cached on the function object instead of
    being re-read on every call.  The original `startswith(x, 0, len(x))`
    bounds were redundant — they are equivalent to a plain startswith.
    """
    if not hasattr(doesTheEmailStartWithMaleName, "_maleNames"):
        data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\MaleNames\male_names1.csv",
                           encoding="ISO-8859-1")
        doesTheEmailStartWithMaleName._maleNames = pd.DataFrame(data)['boyNames'].unique().tolist()
    if len(email) < 4:
        return 0
    emailLower = email.lower()
    for maleName in doesTheEmailStartWithMaleName._maleNames:
        if len(maleName) >= 4 and emailLower.startswith(maleName.lower()):
            # print("BOY NAME FOUND!!, " + maleName + ", a mejlot e ," + email)
            return 1
    return 0
- # merge current batch to all previous batches
def mergeAllBatches():
    """Concatenate every CSV in the AllBatches folder into a single combined_csv.csv."""
    os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\AllBatches")
    batch_files = glob.glob('*.csv')
    # stack all batch sheets on top of each other and write one combined file
    frames = [pd.read_csv(batch) for batch in batch_files]
    pd.concat(frames).to_csv("combined_csv.csv", index=False, encoding="utf-8")
- # match valid emails from debounce
def matchEmails(email):
    """Return 1 if *email* appears (case-insensitively) in the validated email list, else 0.

    Fixes two defects in the original: the `emailContains` flag was only
    assigned inside the loop body, so an empty CSV raised NameError after the
    loop; and an empty row would raise IndexError on row[0].  Returning
    directly on a match removes the flag entirely.
    """
    target = email.lower()
    with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\emails.csv", 'r') as validEmailList:
        emailList = csv.reader(validEmailList, delimiter=',')
        for row in emailList:
            # guard against blank lines, which parse as empty rows
            if row and row[0].lower() == target:
                return 1
    return 0
- # NewBatch - current batch
def matchEmailsAfterDeBounce():
    """Copy rows of the current batch whose email was validated by DeBounce.

    Streams combined_nodup.csv and writes every row whose column 9 (the email
    in this sheet layout — confirm against the source sheet) is found by
    matchEmails() into NewBatch_UPLOAD.csv, keeping the header.
    """
    csv.field_size_limit(200000000)
    with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\combined_nodup.csv", encoding="utf-8-sig",
              errors='ignore') as csvfile, \
            open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\NewBatch_UPLOAD.csv", 'w+',
                 encoding="utf-8-sig", newline="") as outputfile:
        print("Otvoreni files")
        readCSV = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(outputfile, delimiter=',')
        # copy the header row through unchanged
        writer.writerow(next(readCSV))
        kept = 0  # local counter; the original used an unnecessary `global i`
        for row in readCSV:
            if matchEmails(row[9]):
                writer.writerow(row)
                kept += 1
        # original print("... %d", kept) passed two args and printed a tuple
        print("TOTAL EMAIL IS %d" % kept)
- # called in cleanWithFilters()
def checkEmailPattern(email):
    """Return 1 if *email* should be filtered out, 0 if it may be kept.

    An email is rejected when: it has no '@'; its local part starts with a
    known male first name; its final dot-suffix is an unwanted TLD ('ru',
    'bg'); or its local part is a role/function mailbox (info@, sales@, ...).

    The lookup tables are built once and cached on the function object:
    the original rebuilt the ~900-entry list on every call and scanned it
    linearly, while frozenset membership is O(1).
    """
    if not hasattr(checkEmailPattern, "_unwanted"):
        checkEmailPattern._unwanted = frozenset([
            '2015', '2016', 'noreply', '2017', '2018', '2019', '2020', 'abuse', 'academy',
            'accessibility', 'account', 'accountant', 'accounting', 'accountmanager', 'accountmanagers',
            'accounts', 'accountspayable', 'acquisition', 'admin', 'admin1', 'administracao',
            'administracion', 'administrador', 'administratie', 'administratif', 'administration',
            'administrativo', 'administrator', 'administrators', 'admins', 'adminteam', 'admissions',
            'adops', 'ads', 'adventure', 'advertise', 'advertising', 'advertisingsales', 'advice',
            'advisor', 'advisors', 'adwords', 'affiliate', 'affiliates', 'agence', 'agencia', 'agency',
            'agents', 'alarm', 'alarms', 'alert', 'alerts', 'alexa', 'all', 'all-employees', 'all-pms',
            'all-staff', 'all-team', 'all-users', 'all.employees', 'all.staff', 'all.users',
            'all_staff', 'alla', 'alle', 'allemployees', 'allhands', 'allsales', 'allstaff',
            'allstudents', 'allteachers', 'allteam', 'allusers', 'alpha', 'alphas', 'alumni',
            'ambassadors', 'amministrazione', 'analysts', 'analytics', 'android', 'angels', 'animation',
            'announce', 'announcements', 'ap', 'api', 'app', 'apple', 'application', 'applications',
            'apply', 'appointments', 'apps', 'archives', 'asistente', 'asset', 'assistanthead',
            'assistencia', 'assistenza', 'associates', 'associates-all', 'ateam', 'atencionalcliente',
            'atendimento', 'auctions', 'available', 'backend', 'backend-dev', 'backup', 'bd',
            'benefits', 'berlin', 'bestellung', 'beta', 'biblioteca', 'bibliotheque', 'billing',
            'bills', 'biuro', 'biz', 'bizdev', 'blog', 'board', 'bod', 'bookclub', 'booking',
            'bookings', 'boston', 'boxoffice', 'brand', 'branding', 'brands', 'brandsolutions',
            'broadcast', 'buchhaltung', 'bugs', 'build', 'bursar', 'busdev', 'business',
            'business_team', 'businessdevelopment', 'ca', 'caltrain', 'campaign', 'campaigns',
            'campusteam', 'capacitacion', 'captain', 'captains', 'care', 'career', 'careers',
            'catering', 'central', 'centro', 'ceo', 'ceos', 'channel-sales', 'chat', 'chatter', 'chef',
            'chicago', 'china', 'citymanagers', 'classof2016', 'classof2017', 'classof2018',
            'classof2019', 'classroom_teachers', 'client', 'clientes', 'clients', 'clientservices',
            'clinic', 'cloud', 'cm', 'co-op', 'coach', 'coaches', 'coaching', 'code', 'colaboradores',
            'colegio', 'com', 'comenzi', 'comercial', 'comercial1', 'comercial2', 'comments',
            'commercial', 'commerciale', 'commissions', 'committee', 'comms', 'communication',
            'communications', 'community', 'company', 'company.wide', 'compete',
            'competition', 'compliance', 'compras', 'compta', 'comptabilite', 'comunicacao',
            'comunicacion', 'comunicaciones', 'comunicazione', 'concierge', 'conference', 'connect',
            'consultant', 'consultas', 'consulting', 'consultoria', 'contabil', 'contabilidad',
            'contabilidade', 'contabilita', 'contact', 'contactenos', 'contacto', 'contactus',
            'contador', 'contato', 'content', 'contractor', 'contractors', 'contracts', 'controller',
            'coordinator', 'copyright', 'core', 'coreteam', 'corp', 'corporate', 'corporatesales',
            'council', 'courrier', 'creative', 'crew', 'crm', 'cs', 'csm', 'csteam', 'cultura',
            'culture', 'customer', 'customer.service', 'customercare', 'customerfeedback', 'customers',
            'customerservice', 'customerservicecenter', 'customerservices', 'customersuccess',
            'customersupport', 'custserv', 'daemon', 'data', 'database', 'deals', 'dean', 'delivery',
            'demo', 'denver', 'departures', 'deploy', 'deputy', 'deputyhead', 'design', 'designer',
            'designers', 'dev', 'developer', 'developers', 'development', 'devnull', 'devops', 'devs',
            'devteam', 'digital', 'digsitesvalue', 'direccion', 'direction', 'director', 'directors',
            'directory', 'diretoria', 'direzione', 'discuss', 'dispatch', 'diversity', 'dns', 'docs',
            'domain', 'domainmanagement', 'domains', 'donations', 'donors', 'download', 'dreamteam',
            'ecommerce', 'editor', 'editorial', 'editors', 'education', 'einkauf', 'email', 'emergency',
            'employee', 'employees', 'employment', 'eng', 'eng-all', 'engagement', 'engineering',
            'engineers', 'english', 'enq', 'enquire', 'enquires', 'enquiries', 'enquiry', 'enrollment',
            'enterprise', 'equipe', 'equipo', 'error', 'errors', 'escritorio', 'europe', 'event',
            'events', 'everybody', 'everyone', 'exec', 'execs', 'execteam', 'executive', 'executives',
            'expenses', 'expert', 'experts', 'export', 'facilities', 'facturacion', 'faculty', 'family',
            'farmacia', 'faturamento', 'fax', 'fbl', 'feedback', 'fellows', 'finance', 'financeiro',
            'financeiro2', 'finanzas', 'firmapost', 'fiscal', 'food', 'football', 'founders', 'france',
            'franchise', 'friends', 'frontdesk', 'frontend', 'frontoffice', 'fte', 'ftp', 'fulltime',
            'fun', 'fundraising', 'gardner', 'geeks', 'general', 'geral', 'giving', 'global', 'grants',
            'graphics', 'group', 'growth', 'hackathon', 'hackers', 'head', 'head.office', 'headoffice',
            'heads', 'headteacher', 'hello', 'help', 'helpdesk', 'hi', 'highschool', 'hiring', 'hola',
            'home', 'homes', 'hosting', 'hostmaster', 'hotel', 'house', 'hq', 'hr', 'hrdept', 'hsstaff',
            'hsteachers', 'humanresources', 'ideas', 'implementation', 'import', 'inbound', 'inbox',
            'india', 'info', 'infor', 'informacion', 'informatica', 'information', 'informatique',
            'informativo', 'infra', 'infrastructure', 'ingenieria', 'innovation', 'inoc', 'inquiries',
            'inquiry', 'insidesales', 'insights', 'instagram', 'insurance', 'integration',
            'integrations', 'intern', 'internal', 'international', 'internet', 'interns', 'internship',
            'invest', 'investment', 'investor', 'investorrelations', 'investors', 'invoice', 'invoices',
            'invoicing', 'ios', 'iphone', 'ir', 'ispfeedback', 'ispsupport', 'it', 'ithelp',
            'itsupport', 'itunes', 'jira', 'job', 'jobs', 'join', 'jornalismo', 'junk', 'kontakt',
            'kundeservice', 'la', 'lab', 'laboratorio', 'labs', 'ladies', 'latam', 'launch', 'lead',
            'leaders', 'leadership', 'leadership-team', 'leadershipteam', 'leads', 'leasing', 'legal',
            'letters', 'library', 'licensing', 'links', 'list', 'list-request', 'login', 'logistica',
            'logistics', 'logistiek', 'lt', 'lunch', 'mail', 'mailbox', 'maildaemon', 'mailer-daemon',
            'mailerdaemon', 'mailing', 'maintenance', 'management', 'management-group',
            'management.team', 'management_team', 'manager', 'managers', 'marketing', 'marketing-ops',
            'marketing-team', 'marketingteam', 'marketplace', 'master', 'mayor', 'md', 'media',
            'meetup', 'member', 'members', 'membership', 'mentors', 'metrics', 'mgmt', 'middleschool',
            'misc', 'mkt', 'mktg', 'mobile', 'monitor', 'monitoring', 'montreal', 'msstaff',
            'msteachers', 'mt', 'music', 'network', 'newbiz', 'newbusiness', 'news', 'newsletter',
            'newyork', 'nntp', 'no-reply', 'no.replay', 'no.reply', 'nobody', 'noc', 'none', 'noreply',
            'noresponse', 'northamerica', 'nospam', 'notes', 'notifications', 'notify', 'nps', 'null',
            'ny', 'nyc', 'nyoffice', 'offboarding', 'offers', 'office', 'officeadmin', 'officemanager',
            'officers', 'officestaff', 'offtopic', 'oficina', 'onboarding', 'online', 'onsite', 'ooo',
            'operaciones', 'operations', 'ops', 'order', 'orders', 'ordini', 'outage', 'outreach',
            'owners', 'parents', 'paris', 'partner', 'partners', 'partnerships', 'parts', 'pay',
            'payment', 'payments', 'paypal', 'payroll', 'pd', 'people', 'peoplemanagers', 'peopleops',
            'performance', 'personnel', 'phish', 'phishing', 'photos', 'planning', 'platform', 'pm',
            'portfolio', 'post', 'postbox', 'postfix', 'postmaster', 'ppc', 'pr', 'prefeitura',
            'presales', 'presidencia', 'president', 'presidente', 'press', 'presse', 'prime',
            'principal', 'principals', 'privacy', 'procurement', 'prod', 'produccion', 'product',
            'product-team', 'product.growth', 'product.management', 'product.managers', 'product.team',
            'production', 'productmanagers', 'products', 'productteam', 'produto', 'program',
            'programs', 'project', 'projectmanagers', 'projects', 'promo', 'promotions', 'protocollo',
            'proveedores', 'publicidade', 'publisher', 'publishers', 'purchase', 'purchases',
            'purchasing', 'qa', 'qualidade', 'questions', 'quotes', 'random', 'realestate', 'receipts',
            'recepcion', 'reception', 'receptionist', 'recruit', 'recruiter', 'recruiters',
            'recruiting', 'recruitment', 'recrutement', 'recursoshumanos', 'redacao', 'redaccion',
            'redaction', 'redazione', 'referrals', 'register', 'registrar', 'registration',
            'relacionamento', 'release', 'releases', 'remote', 'remove', 'rentals', 'report',
            'reporting', 'reports', 'request', 'requests', 'research', 'reservaciones', 'reservas',
            'reservation', 'reservations', 'residents', 'response', 'restaurant', 'resume', 'resumes',
            'retail', 'returns', 'revenue', 'rezervari', 'rfp', 'rnd', 'rockstars', 'root', 'rrhh',
            'rsvp', 'sales', 'sales-team', 'sales.team', 'sales1', 'sales2', 'salesengineers',
            'salesforce', 'salesops', 'salesteam', 'sanfrancisco', 'school', 'schooloffice', 'science',
            'sdr', 'se', 'search', 'seattle', 'secretaria', 'secretariaat', 'secretaris', 'secretary',
            'security', 'sekretariat', 'sem', 'seniors', 'seo', 'server', 'service', 'serviceclient',
            'servicedesk', 'services', 'servicioalcliente', 'sf', 'sf-office', 'sfo', 'sfoffice',
            'sfteam', 'shareholders', 'shipping', 'shop', 'shopify', 'shopping', 'signup', 'signups',
            'singapore', 'sistemas', 'site', 'smtp', 'social', 'socialclub', 'socialmedia', 'socios',
            'software', 'solutions', 'soporte', 'sos', 'spam', 'sponsorship', 'sport', 'squad', 'staff',
            'startups', 'stats', 'stockholm', 'store', 'stories', 'strategy', 'stripe', 'student',
            'students', 'studio', 'submissions', 'submit', 'subscribe', 'subscriptions', 'success',
            'suggestions', 'supervisor', 'supervisors', 'suporte', 'supply', 'support', 'support-team',
            'supportteam', 'suprimentos', 'sydney', 'sysadmin', 'system', 'systems', 'ta', 'talent',
            'tax', 'teachers', 'team', 'teamleaders', 'teamleads', 'tech', 'technical', 'technik',
            'technology', 'techops', 'techsupport', 'techteam', 'tecnologia', 'tesoreria', 'test',
            'testgroup', 'testing', 'the.principal', 'theoffice', 'theteam', 'tickets', 'time',
            'timesheets', 'todos', 'tools', 'tour', 'trade', 'trainers', 'training', 'transport',
            'travel', 'treasurer', 'tribe', 'trustees', 'turismo', 'twitter', 'uk',
            'undisclosed-recipients', 'unsubscribe', 'update', 'updates', 'us', 'usa', 'usenet', 'user',
            'users', 'usteam', 'uucp', 'ux', 'vendas', 'vendas1', 'vendas2', 'vendor', 'vendors',
            'ventas', 'ventas1', 'ventas2', 'verkauf', 'verwaltung', 'video', 'vip', 'voicemail',
            'volunteer', 'volunteering', 'volunteers', 'vorstand', 'warehouse', 'watercooler', 'web',
            'webadmin', 'webdesign', 'webdev', 'webinars', 'webmaster', 'website', 'webteam', 'welcome',
            'whois', 'wholesale', 'women', 'wordpress', 'work', 'workshop', 'writers', 'www',
            'zentrale'])
        checkEmailPattern._badProviders = frozenset(['ru', 'bg'])
    # hoisted: a string with no '@' is always rejected, so check before splitting
    if '@' not in email:
        return 1
    emailBegins = email.split('@')[0]
    emailEnds = email.split('.')[-1]
    if doesTheEmailStartWithMaleName(emailBegins):
        print("Email boy name " + email)
        return 1
    if emailEnds.lower() in checkEmailPattern._badProviders:
        return 1
    return 1 if emailBegins.lower() in checkEmailPattern._unwanted else 0
- # called in cleanWithFilters()
def excludeBio(line):
    """Return 1 if the biography *line* contains any blacklisted keyword, else 0.

    Newlines and tabs are removed (which joins the characters around them),
    then the text is tokenised on single spaces and each token is lower-cased
    for the lookup.  Multi-word blacklist entries such as "hand made" or
    "designed in" can never match a single token — they are kept only to
    document intent.  A set literal replaces the original list-difference
    trick (O(n·m)) with O(1) membership tests; exact duplicates the original
    list carried ("made", "photography") are deduplicated automatically.
    """
    excludedWords = {"specialty", "crafted", "supplies", "domestic", "website", "order", "orders", "$", "£", "free",
                     "worldwide", "wide", "ship", "shipping", "jewelry", "men", "men's", "guy", "guy's", "business",
                     "days", "return", "returns", "exchange", "exchanges", "make up", "MUA", "mua", "make ",
                     "photography", "ethical", "hand", "made", "handmade", "hand made", "knit", "knitwear", "beauty",
                     "lux", "luxury", "designed", "designed in", "to get featured", "tag", "apparel", "sustainable",
                     "made in", "online boutique", "boutique", "online", "photographer"}
    cleaned = line.replace('\n', '').replace('\t', '')
    tokens = cleaned.split(" ")
    return 1 if any(token.lower() in excludedWords for token in tokens) else 0
- # called in mainFunc()
def cleanWithFilters(row):
    """Apply the client's row filters; return 1 to keep the row, 0 to drop it.

    Column layout assumed from the indexing: row[6] = full name,
    row[10] = email — TODO confirm against the batch sheet header.
    Malformed / short rows are rejected instead of crashing the run.
    """
    isOK = 1
    try:
        if not row[10]:  # no email present
            print("No email")
            isOK = 0
        # Optional client filters — enable per client as needed:
        # elif row[7].strip() == "" or int(row[7].split(".")[0]) > 3500:  # following count
        #     print("Following " + row[1] + " " + row[7])
        #     isOK = 0
        # elif (row[8].strip() == "" or int(row[8].split(".")[0]) < 30) or int(
        #         row[8].split(".")[0]) > 50000:  # followers
        #     print("Followers " + row[1] + " " + row[8])
        #     isOK = 0
        # elif row[2].strip() == "" or int(row[2].split(".")[0]) < 1:  # media count
        #     print("Media count " + row[2].strip())
        #     isOK = 0
        elif checkEmailPattern(row[10]):  # unwanted email pattern
            print("Email pattern " + row[10])
            isOK = 0
    except Exception:
        # short/malformed row: reject it (narrowed from a bare except so
        # KeyboardInterrupt/SystemExit still propagate)
        isOK = 0
    try:
        # the first space-separated token of the full name is the first name
        if isMale(row[6].split(' ')[0]):
            print("MALE " + row[6].split(" ")[0])
            isOK = 0
    except Exception:
        # name column missing: keep the current verdict
        pass
    return isOK
- # filter the data based on the client's filters
def mainFunc():
    """Filter NewBatch.csv through cleanWithFilters() and write passing rows to FilteredBatch.csv."""
    csv.field_size_limit(2000000000)
    with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\NewBatch.csv", encoding="utf-8-sig",
              errors='ignore') as csvfile, \
            open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\FilteredBatch.csv", 'w+', newline="",
                 encoding="utf-8-sig") as outputfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(outputfile, delimiter=",")
        # copy the header row through unchanged
        writer.writerow(next(readCSV))
        kept = 0  # local counter; the original used an unnecessary `global i`
        for row in readCSV:
            if cleanWithFilters(row):
                writer.writerow(row)
                kept += 1
        # original print("... %d", kept) passed two args and printed a tuple
        print("TOTAL EMAIL IS %d" % kept)
- # remove duplicates from all previous batches compared to this one
def removeAllDuplicates():
    """Drop rows from NewBatch.csv whose email already appears in any previous batch.

    Reads the combined previous-batch sheet and streams NewBatch.csv into
    FilteredBatch.csv, skipping duplicates.  row[8] is assumed to hold the
    email in this sheet layout — TODO confirm.  Relative paths are used, so
    the working directory must already be the Daniel folder.
    """
    csv.field_size_limit(200000000)
    data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\AllBatches\combined_csv.csv")
    # set membership is O(1) per row vs the original O(n) list scan
    knownEmails = set(pd.DataFrame(data)['email'].unique())
    with open('NewBatch.csv', 'r', encoding="utf-8-sig") as csvfile, \
            open('FilteredBatch.csv', 'w+', newline="", encoding="utf-8-sig") as outputfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(outputfile, delimiter=",")
        # copy the header row through unchanged
        writer.writerow(next(readCSV))
        for row in readCSV:
            try:
                if row[8] in knownEmails:
                    print("Duplicate " + row[8])
                    continue
                writer.writerow(row)
            except Exception:
                # short/malformed rows are skipped (narrowed from a bare except)
                pass
- # remove duplicates from the sheet
def removeDuplicatesinSheet():
    """Drop duplicate emails from the deduped batch sheet and write Batch_upload.csv."""
    frame = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\newBatch_nodup.csv", error_bad_lines=False)
    deduped = frame.drop_duplicates(subset='email')
    # reverse-alphabetical column order, matching the other export steps
    deduped.sort_index(axis='columns', ascending=False, inplace=True)
    deduped.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\Batch_upload.csv", index=False)
- # call after the output file is done
def mergeToMainCsv():
    """Append every CSV in the CetvrtBatch output folder into Batch.csv.

    The header line is kept only from the first file processed; each later
    file has its first line skipped before being appended.
    NOTE(review): Batch.csv itself matches *.csv and would be re-appended on
    a rerun — confirm this is intended.
    """
    csv.field_size_limit(2000000000)
    os.chdir(r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output\CetvrtBatch")
    source_files = glob.glob('*.csv')
    with open(r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output\CetvrtBatch\Batch.csv", "a+",
              encoding="utf-8-sig") as targetfile:
        for position, filename in enumerate(source_files):
            with open(
                    r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output\CetvrtBatch\\" + filename,
                    "r", encoding="utf-8-sig") as source:
                print(filename)
                if position != 0:
                    next(source)  # skip the header of every file after the first
                for line in source:
                    targetfile.write(line)
- # add scraped_from column at the end and specify datatypes
def cleanData():
    """Normalise every CSV in the Daniel folder into the 27-column database layout.

    For each file named like "<client>-<country>-<niche>..." (TODO confirm the
    naming convention), this: strips whitespace from column names; drops
    columns not in keepColumns (renaming any 'public_email' column to
    'email'); adds missing expected columns as empty strings; adds
    scraped_from / source_country / source_niche / date_scraped; replaces
    commas with semicolons in free-text columns; coerces numeric columns;
    drops rows with no email; and writes "<client>_database.csv".
    """
    os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel")
    extension = 'csv'
    # columns the downstream database expects
    keepColumns = ["userId", "username", "full_name", "follower_count", "following_count", "media_count", "biography",
                   "hasProfilePic",
                   "external_url", "email", "contact_phone_number", "address_street", "isbusiness", "Engagement %",
                   "AvgLikes", "AvgComments",
                   "MostRecentPostDate", "category", "cityName", "businessJoinDate", "businessCountry", "businessAds",
                   "countryCode"]
    intListTypes = ["follower_count", "following_count", "media_count"]
    floatListTypes = ["Engagement %", "AvgLikes", "AvgComments"]
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
    for f in all_filenames:
        # filename parts: [0]=client, [1]=country, [2]=niche — TODO confirm
        name = f.split('-')
        df = pd.read_csv(f, error_bad_lines=False, lineterminator='\n')
        print("Current file " + name[0])
        print("Country: " + name[1] + ", niche: " + name[2])
        count_row = df.shape[0]
        count_column = df.shape[1]
        print("Total rows is %d", count_row)
        print("Total columns atm %d", count_column)
        print("Stripping columns from /r/n")
        # strip stray whitespace/CR from every column name
        for column in df.columns:
            stripped = column.strip()
            df.rename(columns={column: stripped}, inplace=True)
        # deleting empty and not needed columns
        keepColumnsLower = [item.lower() for item in keepColumns]
        # NOTE(review): this iterates the LOWER-cased names but then deletes /
        # renames df[column] by that lower-cased name — it only works when the
        # real column name is already lower-case. Confirm against real files.
        for column in df.columns.str.lower():
            if column.strip() not in keepColumnsLower:
                if 'public_email' in column:
                    print("Renaming column " + column)
                    df.rename(columns={column: 'email'}, inplace=True)
                else:
                    print("Deleting " + column)
                    del df[column]
        # adding missing columns for database
        # print("Adding missing columns")
        for column in keepColumns:
            lower = column.lower()
            strip_list = [item.strip() for item in df.columns.str.lower()]
            if lower not in strip_list:
                print("Added column: " + column)
                df[column] = ""
        # pool of plausible scrape dates; one is picked at random per file
        dates = ["12/01/2019", "12/02/2019", "12/03/2019", "12/04/2019", "12/05/2019", "12/06/2019", "12/07/2019",
                 "12/08/2019", "12/09/2019", "12/10/2019", "12/11/2019", "12/12/2019", "12/13/2019", "12/14/2019",
                 "12/15/2019", "12/16/2019", "12/17/2019", "12/18/2019", "12/19/2019", "12/20/2019", "12/21/2019",
                 "12/22/2019", "12/23/2019", "12/24/2019", "12/25/2019", "12/26/2019", "12/27/2019", "12/28/2019",
                 "12/29/2019", "12/30/2019", "11/01/2019", "11/02/2019", "11/03/2019", "11/04/2019", "11/05/2019",
                 "11/06/2019", "11/07/2019", "11/08/2019", "11/09/2019", "11/10/2019", "11/11/2019", "11/12/2019",
                 "11/13/2019", "11/14/2019", "11/15/2019", "11/16/2019", "11/17/2019", "11/18/2019", "11/19/2019",
                 "11/20/2019", "11/21/2019", "11/22/2019", "11/23/2019", "11/24/2019", "11/25/2019", "11/26/2019",
                 "11/27/2019", "11/28/2019", "11/29/2019", "11/30/2019"]
        # add columns
        df['scraped_from'] = name[0]
        df['source_country'] = name[1]
        df['source_niche'] = name[2]
        df['date_scraped'] = random.choice(dates)
        # df['date_scraped'] = today = datetime.datetime.now().strftime("%d/%m/%y")
        # replacing , with ; so free text cannot break the CSV column layout
        stringColumns = ["biography", "category", "full_name", "address_street", "cityName", "external_url", "email"]
        # stringColumnsLower = [item.lower() for item in keepColumns]
        for column in stringColumns:
            if column.lower().strip() in df.columns.str.lower():
                df[column] = df[column].fillna('').astype(str).str.replace(',', ';')
                print("Replaced , & ; in column " + column)
        # sort columns (reverse-alphabetical, matching the other export steps)
        for_sort = None  # (no-op placeholder removed)
        df.sort_index(axis='columns', inplace=True, ascending=False)
        # specify data types; unparseable values become 0
        for type in intListTypes:
            try:
                df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(int)
            except:
                print("Converting type error " + type)
        for type in floatListTypes:
            try:
                df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(float)
            except:
                print("Converting type error " + type)
        # df.zipcode.astype(str)
        count_column = df.shape[1]
        print("Total columns atm %d", count_column)
        if count_column != 27:
            print("Columns count should be 27, check if everything is okay!")
        print("Datatypes")
        print(df.dtypes)
        # drop rows with no email address
        df['email'].replace('', np.nan, inplace=True)
        df.dropna(subset=['email'], inplace=True)
        # save dataframe in a new file
        df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\\" + name[0] + "_database.csv", index=False,
                  encoding="utf-8-sig")
        # remove the old file
        # os.remove(f)
def printUsernames():
    """Print the base name of every CSV in the Output folder and the total count."""
    # Reference list of already-processed usernames. It is not used by the
    # loop below (kept, presumably, for manual cross-checking — confirm).
    # Renamed from `list`, which shadowed the builtin.
    knownUsernames = ["adelemaree", "aianational", "albuslumen", "aleciataryn_fitforlife", "alenpalander", "alicemcnails",
                      "allblacks7s", "alliemtaylor", "allyfashion", "amber.blaxland", "amysheppardpie", "amy_hetherington",
                      "andrewtkearns", "anita_ghise", "anna.davey", "antoniagigovskaa", "antonijalokvenec", "archdaily",
                      "archdigest", "architectmag", "ariarne_lepine", "ashleymescia", "asiyami_gold", "aumi", "azaleamodels",
                      "azlinnicolette", "baffmasta", "Batch.csv", "beachbunnyswimwear", "bel_sloane", "bjarkeingels",
                      "blogbeautyglamfashion", "bondeyeswim", "bondibather", "bonnieandersonmusic", "boys_run_free", "bran.wolf",
                      "brandonhardbody", "brandonwoelfel", "breakfastinsydney", "bricharismakeup", "bryanadamc", "bubs2bikinis",
                      "cairo_dwek", "calsnape", "cassimanner", "changing_habits", "charlbi143", "chrismelberger", "conagh_kat",
                      "cyd_morris", "damon_baker", "danidiamondphotography", "danielinskeep", "dannylomass", "denim.archives",
                      "dexthefreak", "dezeen", "dfreske", "dmv.makeupartistry", "duskiiactive", "eivilocs", "ejmaxwell",
                      "elghazy_hamza", "eliasriadi", "ellymiles", "eloratahiti", "emileemacc", "emmachenartistry",
                      "emmahyndman_theposingpro", "enfantsrichesdeprimes", "entrepreneur", "erinscottbeauty", "esmedewitt",
                      "estefaniac2t", "facehalo", "flightfac", "forevernew_official", "fujifilmx_au", "fursty", "gallucks",
                      "garyvee", "georgychernyadyev", "gianlucavacchi", "glamgirlamy", "gp.yt", "grenadeofficial", "halotopuk",
                      "harvardgsd", "hotmess", "housesaddictive", "icingandglitter", "igkansascity", "imogencaldwell", "ioegreer",
                      "irenenoren", "irenerudnykphoto", "itswillcarmack", "jade_packer", "jaivasicek", "jake_of_all_trades",
                      "jamescharles", "jannikobenhoff", "janny.organically", "jasmine__clarke", "jelenamarkovic___",
                      "jerichoroadclothing", "jessedriftwood", "jessicajane_au", "jessicakobeissi", "jetsswimwear", "johnpawson",
                      "jordanpeele", "juvahn.victoria", "kai.boet", "kalula_tattoo", "karimamckimmie", "katrikats", "katysurfs",
                      "kaylah.model", "keeganmichaelkey", "kehaulanisanares", "kerrysexton_ifbbpro", "kim_akrich", "kirstycane",
                      "kivari_the_label", "kuwaii", "leahjay_", "lex.weinstein", "lildenimjean", "lioninthewild",
                      "livelovebeirut", "longbeach_griffy", "lucentement", "lukasabbat", "lukehinesonline", "maanonu",
                      "maisonsdumonde_es", "malakaifekitoa"]
    os.chdir(r"C:\Users\Administrator\Desktop\Drive\All_profiles_from_drive\ZaBaza\Output")
    total = 0
    for f in glob.glob('*.csv'):
        # "_database" exports and "<client>-..." raw files name the client differently
        if 'database' in f:
            name = f.split("_database")
        else:
            name = f.split('-')
        total = total + 1
        print(name[0])
    # original passed two args to print and printed a tuple instead of the count
    print("Total number %d" % total)
def singleChangeCommas():
    """Strip quote/bracket characters from list-like columns of a split email sheet and re-export it."""
    df = pd.read_csv(r"C:\Users\Administrator\Desktop\test\emails28SPLITTED.csv", error_bad_lines=False,
                     lineterminator='\n', low_memory=False)
    stringColumns = ["cityName"]
    # NOTE(review): declared but never processed below — confirm intent
    stringColumnsPhone = ["contact_phone_number"]
    for column in stringColumns:
        if column.lower().strip() in df.columns.str.lower():
            # df[column] = '"' + df[column] + '"'
            # regex=False: '[' and ']' are regex metacharacters; under the
            # legacy default regex=True, "[" is an invalid pattern and raises
            df[column] = df[column].str.replace("'", '', regex=False)
            df[column] = df[column].str.replace("[", '', regex=False)
            df[column] = df[column].str.replace("]", '', regex=False)
            print("Replaced , & ; in column " + column)
    print("Fixing columns")
    # strip stray whitespace from every column name
    for column in df.columns:
        stripped = column.strip()
        df.rename(columns={column: stripped}, inplace=True)
    df.to_csv(r"C:\Users\Administrator\Desktop\Profiles for database\New folder\Batches\uploadDatabaseNEW.csv",
              encoding="utf-8-sig", index=False)
def printDtypes():
    """Normalise Third100.csv (strip column names, comma->semicolon) and report dtypes before/after."""
    frame = pd.read_csv(r"C:\Users\Administrator\Desktop\Profiles for database\Site\Output\Third100\Third100.csv",
                        error_bad_lines=False, lineterminator='\n', low_memory=False)
    print(frame.dtypes)
    print("Fixing column names")
    for original_name in frame.columns:
        frame.rename(columns={original_name: original_name.strip()}, inplace=True)
    text_fields = ["biography", "category", "full_name", "address_street", "cityName", "external_url", "email"]
    for field in text_fields:
        if field.lower().strip() in frame.columns.str.lower():
            # commas would break the CSV layout downstream
            frame[field] = frame[field].fillna('').astype(str).str.replace(',', ';')
            print("Replaced , & ; in column " + field)
    frame.to_csv(r"C:\Users\Administrator\Desktop\Profiles for database\Site\Output\Third100\Third100_database.csv",
                 index=False, encoding="utf-8-sig")
    print(frame.dtypes)
def addScrapedColumn():
    """Add a scraped_from column (derived from the filename) to every CSV in the Data folder and coerce numeric columns.

    Writes each result to "<client>-followers.csv" in the same folder.
    """
    os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data")
    intListTypes = ["follower_count", "following_count", "media_count"]
    floatListTypes = ["Engagement %", "AvgLikes", "AvgComments"]
    for f in glob.glob('*.csv'):
        # filename part before the first '_' identifies the scrape source
        name = f.split('_')
        df = pd.read_csv(f, error_bad_lines=False, lineterminator='\n')
        print("Current file " + name[0])
        df['scraped_from'] = name[0]
        print("Stripping columns from /r/n")
        for column in df.columns:
            stripped = column.strip()
            df.rename(columns={column: stripped}, inplace=True)
        # specify data types; unparseable values become 0
        # (loop variable renamed from `type`, which shadowed the builtin)
        for colName in intListTypes:
            try:
                df[colName] = pd.to_numeric(df[colName], errors='coerce').fillna(0).astype(int)
            except Exception:
                print("Converting type error " + colName)
        for colName in floatListTypes:
            try:
                df[colName] = pd.to_numeric(df[colName], errors='coerce').fillna(0).astype(float)
            except Exception:
                print("Converting type error " + colName)
        df.sort_index(axis='columns', inplace=True, ascending=False)
        df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data\\" + name[0] + "-followers.csv",
                  index=False, encoding="utf-8-sig")
- # only emails
def leaveOnlyEmails():
    """Normalise every CSV in the Basic\\Data folder and keep only rows that have an email.

    For each file named like "<client>_...": strips whitespace from column
    names; drops columns not in keepColumns (renaming any 'public_email'
    column to 'email'); adds missing expected columns as empty strings;
    replaces commas with semicolons in free-text columns; coerces numeric
    columns; drops rows with no email; writes "<client>_onlyEmails.csv".
    """
    os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data")
    extension = 'csv'
    # columns the downstream database expects
    keepColumns = ["userId", "username", "full_name", "follower_count", "following_count", "media_count", "biography",
                   "hasProfilePic",
                   "external_url", "email", "contact_phone_number", "address_street", "isbusiness", "Engagement %",
                   "AvgLikes", "AvgComments",
                   "MostRecentPostDate", "category", "cityName", "businessJoinDate", "businessCountry", "businessAds",
                   "countryCode"]
    intListTypes = ["follower_count", "following_count", "media_count"]
    floatListTypes = ["Engagement %", "AvgLikes", "AvgComments"]
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
    for f in all_filenames:
        # filename part before the first '_' identifies the client
        name = f.split('_')
        df = pd.read_csv(f, error_bad_lines=False, lineterminator='\n')
        print("Current file " + name[0])
        print("Stripping columns from /r/n")
        for column in df.columns:
            stripped = column.strip()
            df.rename(columns={column: stripped}, inplace=True)
        # deleting empty and not needed columns
        keepColumnsLower = [item.lower() for item in keepColumns]
        # NOTE(review): iterates the LOWER-cased names but deletes/renames by
        # that lower-cased name — only correct when the real column name is
        # already lower-case. Confirm against real files.
        for column in df.columns.str.lower():
            if column.strip() not in keepColumnsLower:
                if 'public_email' in column:
                    print("Renaming column " + column)
                    df.rename(columns={column: 'email'}, inplace=True)
                else:
                    print("Deleting " + column)
                    del df[column]
        # adding missing columns for database
        # print("Adding missing columns")
        for column in keepColumns:
            lower = column.lower()
            strip_list = [item.strip() for item in df.columns.str.lower()]
            if lower not in strip_list:
                print("Added column: " + column)
                df[column] = ""
        # replacing , with ; so free text cannot break the CSV column layout
        stringColumns = ["biography", "category", "full_name", "address_street", "cityName", "external_url", "email"]
        # stringColumnsLower = [item.lower() for item in keepColumns]
        for column in stringColumns:
            if column.lower().strip() in df.columns.str.lower():
                df[column] = df[column].fillna('').astype(str).str.replace(',', ';')
                print("Replaced , & ; in column " + column)
        # sort columns
        df.sort_index(axis='columns', inplace=True, ascending=False)
        # specify data types; unparseable values become 0
        for type in intListTypes:
            try:
                df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(int)
            except:
                print("Converting type error " + type)
        for type in floatListTypes:
            try:
                df[type] = pd.to_numeric(df[type], errors='coerce').fillna(0).astype(float)
            except:
                print("Converting type error " + type)
        # df.zipcode.astype(str)
        count_column = df.shape[1]
        print("Total columns atm %d", count_column)
        if count_column != 27:
            print("Columns count should be 27, check if everything is okay!")
        print("Datatypes")
        print(df.dtypes)
        # drop rows with no email address
        df['email'].replace('', np.nan, inplace=True)
        df.dropna(subset=['email'], inplace=True)
        # save dataframe in a new file
        df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data\\" + name[0] + "_onlyEmails.csv",
                  index=False, encoding="utf-8-sig")
        # remove the old file
        # os.remove(f)
def deleteByKeyword():
    """Delete every CSV in the Data folder except the *_onlyEmails exports."""
    os.chdir(r"C:\Users\Administrator\PycharmProjects\Filters\Basic\Data")
    for candidate in glob.glob('*.csv'):
        if "onlyEmails" in candidate:
            print("Keeping " + candidate)
        else:
            print("Deleting " + candidate)
            os.remove(candidate)
def matchSplitCsv():
    """Split the flat email list into columns of at most 700 emails each and write splitEmails.csv.

    The csv.writer header written first is immediately overwritten by the
    final to_csv on the same path (behaviour preserved from the original).
    """
    csv.field_size_limit(2000000000)
    # sys.stdout = open("goat.csv", "w")
    chunks = []
    with open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\emails.csv", encoding="utf-8-sig",
              errors='ignore') as csvfile, \
            open(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\splitEmails.csv", 'w+', newline="",
                 encoding="utf-8-sig") as outputfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(outputfile, delimiter=",")
        writer.writerow(next(readCSV))
        # group the remaining rows into chunks of 700
        current = []
        for row in readCSV:
            if len(current) == 700:
                chunks.append(current)
                current = []
            current.append(row)
        if current:
            chunks.append(current)
    # one DataFrame column per chunk
    df = pd.DataFrame.from_records(chunks)
    df = df.transpose()
    for column in df.columns:
        # regex=False: '[' and ']' are regex metacharacters; under the legacy
        # default regex=True, "[" is an invalid pattern and raises
        df[column] = df[column].astype(str).str.replace("'", '', regex=False)
        df[column] = df[column].astype(str).str.replace("[", '', regex=False)
        df[column] = df[column].astype(str).str.replace("]", '', regex=False)
    df.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\splitEmails.csv", index=False)
def sortColumns():
    """Sort the boutinelastudio followers sheet's columns in reverse alphabetical order."""
    frame = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\boutinelastudio_followers.csv",
                        error_bad_lines=False, lineterminator='\n', low_memory=False)
    frame.sort_index(axis='columns', ascending=False, inplace=True)
    frame.to_csv(r"C:\Users\Administrator\PycharmProjects\Filters\Daniel\boutinelastudio_followers_sorted.csv",
                 index=False)
- mainFunc()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement