Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from more_itertools import unique_everseen
- import requests, json
- import csv
- import os
- import glob
- import pandas as pd
def isMale(name):
    """Report whether ``name`` appears in the first column of ``male_names1.csv``.

    The comparison ignores case. The names file is opened relative to the
    current working directory and re-read on every call.

    Args:
        name: The name to look up.

    Returns:
        1 if the first cell of some row equals ``name`` (case-insensitively),
        otherwise 0.

    Raises:
        OSError: If ``male_names1.csv`` cannot be opened.
    """
    wanted = name.lower()
    # The csv module asks for newline='' on files handed to csv.reader.
    with open('male_names1.csv', 'r', newline='') as maleNames:
        for row in csv.reader(maleNames, delimiter=','):
            # csv.reader yields [] for blank lines; indexing one would raise
            # IndexError, so such rows are skipped.
            if row and row[0].lower() == wanted:
                return 1
    return 0
def excludeBio(line):
    """Report whether a bio contains any of the hard-coded exclusion words.

    The text is lower-cased and split on whitespace; a word counts only when
    it equals a whole token (so "$" matches a stand-alone "$", not "$5").
    When a match is found the bio is printed with newlines and tabs removed.

    Args:
        line: The bio text to inspect.

    Returns:
        1 if at least one token is an excluded word, otherwise 0.
    """
    # "orders" used to be spelled "orders " (trailing space); a token produced
    # by splitting on whitespace can never contain a space, so it never matched.
    excludedWords = {"specialty", "crafted", "supplies", "domestic", "website", "order", "orders", "$", "£", "free",
                     "world", "worldwide", "wide", "ship", "shipping", "jewelry", "men", "men's", "guy", "guy's",
                     "business", "days", "return", "returns", "exchange", "exchanges"}
    # Split on any whitespace rather than deleting newlines/tabs first:
    # deleting them glued neighbouring words together ("free\nshipping" became
    # "freeshipping"), so multi-line bios slipped past the filter.
    tokens = line.lower().split()
    if any(token in excludedWords for token in tokens):
        print(line.replace('\n', '').replace('\t', ''))
        return 1
    return 0
def cleanWithFilters(row):
    """Decide whether a CSV row passes every filter.

    Each failing check prints a short tag ("P1" ... "P7", or the offending
    first name) and the remaining checks still run, so one row can print
    several tags. Any error raised while checking (short row, non-numeric
    cell, unreadable names file, ...) prints "except" and rejects the row.

    Checks, by column index:
        row[5]  must not exceed 3000             (P1)
        row[4]  must be at least 30              (P2)
        row[7]  is passed to excludeBio          (P3, report only - see note)
        row[18] must not be an excluded category (P4)
        row[6]  must be at least 1               (P5)
        row[10] must be non-empty                (P6)
        row[2]  first word must not satisfy isMale
        row[10] must not satisfy checkEmailPattern (P7)

    Args:
        row: A sequence of string cells, as produced by csv.reader.

    Returns:
        1 if the row should be kept, 0 if it should be dropped.
    """
    excludeCategoryList = ["Non-Profits & Religious Organizations", "Content & Apps", "Local Business",
                           "Transportation & Accomodation Services", "Grocery & Convenience Stores", "Auto Dealers",
                           "Business & Utility Services", "Product/Service", "Photographer", "Movie Theater"]
    isOK = 1
    try:
        if int(row[5]) > 3000:
            print("P1")
            isOK = 0
        if int(row[4]) < 30:
            print("P2")
            isOK = 0
        if excludeBio(row[7]):
            # NOTE(review): unlike every other check this one only reports and
            # does not reject the row. Kept as-is; confirm whether that is
            # intentional or a missing `isOK = 0`.
            print("P3")
        if row[18] in excludeCategoryList:
            print("P4")
            isOK = 0
        if int(row[6]) < 1:
            print("P5")
            isOK = 0
        if not row[10]:
            print("P6")
            isOK = 0
        firstName = row[2].split(' ')[0]
        if isMale(firstName):
            print(firstName)
            isOK = 0
        if checkEmailPattern(row[10]):
            print("P7")
            isOK = 0
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit and turned a Ctrl-C into just another rejected row.
        print("except")
        isOK = 0
    return isOK
def checkEmailPattern(email):
    """Flag e-mail addresses whose local part is a generic mailbox name.

    The text before the first "@" (the whole string when there is no "@") is
    lower-cased and compared against a fixed list of unwanted names.

    Args:
        email: The address to inspect.

    Returns:
        1 if the local part is one of the unwanted names, otherwise 0.
    """
    unwantedLocalParts = ("hi", "info", "aloha", "service", "customerservice", "hello", "help", "photo",
                          "photographer", "shop", "customer", "contact")
    localPart, _, _ = email.partition('@')
    return 1 if localPart.lower() in unwantedLocalParts else 0
def matchEmails(email):
    """Report whether ``email`` appears in the first column of ``mejlovizaproverka.csv``.

    The comparison ignores case. The file is opened relative to the current
    working directory and re-read on every call.

    Args:
        email: The address to look up.

    Returns:
        1 if the first cell of some row equals ``email`` (case-insensitively),
        otherwise 0 - including when the file has no rows at all.

    Raises:
        OSError: If ``mejlovizaproverka.csv`` cannot be opened.
    """
    wanted = email.lower()
    # The csv module asks for newline='' on files handed to csv.reader.
    with open('mejlovizaproverka.csv', 'r', newline='') as validEmailList:
        for row in csv.reader(validEmailList, delimiter=','):
            # Blank lines come back as [] and must not be indexed.
            if row and row[0].lower() == wanted:
                return 1
    # Returning here directly replaces a flag that was only assigned inside
    # the loop and was therefore unbound when the file was empty.
    return 0
# Call when nothing is done yet.
def mainFunc(inputPath='amy_hetherington_followers_PR6440_1878530020.csv', outputPath='amy_hetherington.csv'):
    """Copy the rows of a CSV file that pass cleanWithFilters into a new CSV file.

    The first row of the input is treated as a header and copied unchanged.
    Every following row is written to the output only when cleanWithFilters
    returns a truthy value for it.

    Args:
        inputPath: CSV file to read. Defaults to the file name that was
            previously hard-coded.
        outputPath: CSV file to create or overwrite. Defaults to the file name
            that was previously hard-coded.

    Returns:
        None. An empty input file produces an empty output file.

    Raises:
        OSError: If either file cannot be opened.
    """
    # newline='' lets the csv module control line endings itself; without it
    # the writer's "\r\n" becomes "\r\r\n" on Windows (a blank line after
    # every row) and quoted fields containing newlines are read incorrectly.
    with open(inputPath, newline='') as csvfile, open(outputPath, 'w', newline='') as outputfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(outputfile, delimiter=",")
        header = next(readCSV, None)
        if header is None:
            # Empty input: nothing to filter, and no StopIteration to leak.
            return
        writer.writerow(header)
        for row in readCSV:
            if cleanWithFilters(row):
                writer.writerow(row)
# Call after the output file is done.
def mergeToMainCsv(directory="/Users/nikolasokolov/PycharmProjects/untitled/allbatches"):
    """Concatenate every ``*.csv`` file in ``directory`` into ``combined_csv.csv``.

    The process working directory is changed to ``directory`` and stays
    changed after the call. The result is written into that same directory
    without the index column, encoded as UTF-8 with a BOM.

    Args:
        directory: Folder holding the batch files. Defaults to the path that
            was previously hard-coded.

    Returns:
        None. When the folder contains no CSV files nothing is written.

    Raises:
        OSError: If ``directory`` does not exist or cannot be entered.
    """
    os.chdir(directory)
    extension = 'csv'
    # Sorted so the row order of the result does not depend on the order in
    # which the file system happens to list the files.
    all_filenames = sorted(glob.glob('*.{}'.format(extension)))
    if not all_filenames:
        # pd.concat raises ValueError when given an empty list.
        print("No csv files found in " + directory)
        return
    # combine all files in the list
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
    # combined_csv.csv lives in this folder and matches the glob, so every
    # re-run reads back its own previous output. Dropping fully identical rows
    # keeps what has accumulated without multiplying it on each run.
    combined_csv = combined_csv.drop_duplicates()
    # export to csv
    combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
# Remove from this batch every row whose e-mail already appears in the previous batches.
def removeDuplicates(combinedPath="/Users/nikolasokolov/PycharmProjects/untitled/allbatches/combined_csv.csv",
                     inputPath='50k.csv', outputPath='50k - FINAL.csv'):
    """Copy a batch CSV, leaving out rows whose e-mail is already in the combined CSV.

    The set of known addresses is the ``email`` column of ``combinedPath``.
    In the batch file the address is read from column index 10. The first row
    of the batch is treated as a header and copied unchanged. Each dropped row
    is reported with a "Duplicate ..." line on stdout.

    Args:
        combinedPath: CSV with an ``email`` column listing known addresses.
        inputPath: Batch CSV to filter.
        outputPath: CSV file to create or overwrite with the surviving rows.
        All three default to the paths that were previously hard-coded.

    Returns:
        None. An empty batch file produces an empty output file.

    Raises:
        OSError: If a file cannot be opened.
        KeyError: If the combined CSV has no ``email`` column.
    """
    # A set gives constant-time lookups; the previous list was scanned once
    # per batch row.
    knownEmails = set(pd.read_csv(combinedPath)['email'].unique())
    # newline='' lets the csv module handle line endings (no doubled blank
    # lines on Windows, correct handling of newlines inside quoted fields).
    with open(inputPath, newline='') as csvfile, open(outputPath, 'w', newline='') as outputfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(outputfile, delimiter=",")
        header = next(readCSV, None)
        if header is None:
            return
        writer.writerow(header)
        for row in readCSV:
            # Rows too short to have an e-mail cell cannot be duplicates; they
            # are passed through instead of raising IndexError.
            if len(row) > 10 and row[10] in knownEmails:
                print("Duplicate " + row[10])
                continue
            writer.writerow(row)
def addScrapedColumn(filePath="stanzie.star_followers_FT433_794983088.csv", scrapedFrom="stanzie.star"):
    """Add a ``scrapedFrom`` column to a CSV file and rewrite the file in place.

    Malformed lines in the input are skipped. The resulting frame is printed
    and then written back over ``filePath``.

    Args:
        filePath: CSV file to update. Defaults to the file name that was
            previously hard-coded.
        scrapedFrom: Value stored in every row of the new column. Defaults to
            the value that was previously hard-coded.

    Returns:
        None.
    """
    try:
        df = pd.read_csv(filePath, on_bad_lines='skip')
    except TypeError:
        # pandas older than 1.3 does not know on_bad_lines; error_bad_lines is
        # the equivalent there (and was removed again in pandas 2.0).
        df = pd.read_csv(filePath, error_bad_lines=False)
    df['scrapedFrom'] = scrapedFrom
    print(df)
    # NOTE(review): the index is written too, which adds an unnamed leading
    # column and shifts every positional column index by one. Kept as-is;
    # confirm whether index=False is wanted here.
    df.to_csv(filePath)
# The order in which to call the functions is:
# mainFunc(), after changing the file name to the one that needs to be checked
# once you have merged the results in Sheets and removed duplicates from the merged data,
# download the batch that is ready and put it in allbatches
# call removeDuplicates for the batch
# validate and send
#
# Guarded so that importing this file to reuse a helper does not start a run.
if __name__ == "__main__":
    mainFunc()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement