Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Exploratory analysis of the Museum of Modern Art (MoMA) artworks dataset:
# https://github.com/MuseumofModernArt/collection/blob/master/Artworks.csv
# To fetch a local copy manually, run:
#   wget https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv
# and place the file in the same directory as this script.
- import csv
- import io
- import urllib.request
- import pandas as pd
- import string
- import re
- import math
- import os.path
- import numpy
- # you need to pip install wget for this to work
- import wget
# Path of the local copy of the dataset, relative to the working directory.
# The script downloads the file to this location when it is missing; it can
# also be fetched by hand with:
#   wget https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv
localfile = "Artworks.csv"
def gender_code(gender_string):
    """Encode a raw gender cell as 1 (female), 0 (anything else) or NaN.

    The cell is presumably formatted like "(Male)" or, for works with several
    artists, "(Male) (Female)" -- verify against the dataset.  Parentheses
    are stripped and only the LAST whitespace-separated token is considered.

    Args:
        gender_string: Raw cell value; a float NaN marks a missing cell.

    Returns:
        The NaN input unchanged when the cell is missing, NaN when the cell
        holds no token at all (e.g. "()"), otherwise ``1`` if the last token
        is "female" (compared case-insensitively) and ``0`` if it is not.
    """
    if isinstance(gender_string, float) and math.isnan(gender_string):
        return gender_string
    # split() without arguments ignores leading/trailing/repeated whitespace,
    # so "(Female) " is no longer misread as an empty last token.
    tokens = re.sub(r"[()]", "", str(gender_string)).split()
    if not tokens:
        # An empty entry carries no gender information; report it as missing
        # instead of silently coding it as 0.
        return float("nan")
    return int(tokens[-1].casefold() == "female")
def nationality_code(nationality_string):
    """Extract the last nationality label from a raw nationality cell.

    The cell is presumably formatted like "(American)" or, for works with
    several artists, "(French) (American)" -- verify against the dataset.
    The content of the LAST parenthesised group is returned in full, so
    multi-word labels such as "(South African)" are kept intact.  A cell
    without any parentheses is returned as-is, stripped of whitespace.

    Args:
        nationality_string: Raw cell value; a float NaN marks a missing cell.

    Returns:
        The NaN input unchanged when the cell is missing, NaN when the
        selected label is empty (e.g. "()"), otherwise the label string.
    """
    if isinstance(nationality_string, float) and math.isnan(nationality_string):
        return nationality_string
    text = str(nationality_string)
    # Capture each "(...)" group separately instead of splitting on spaces,
    # which would cut a multi-word label down to its final word.
    groups = re.findall(r"\(([^()]*)\)", text)
    label = groups[-1] if groups else re.sub(r"[()]", "", text)
    label = label.strip()
    # An empty label carries no information; report it as missing rather
    # than as an empty-string category.
    return label if label else float("nan")
def safe_div(x, y):
    """Divide ``x`` by ``y``, returning 0 instead of failing on a zero divisor.

    Intended for scalar operands: the ``y != 0`` test needs an unambiguous
    truth value.

    Args:
        x: Dividend.
        y: Divisor.

    Returns:
        ``x / y`` when ``y`` is non-zero, otherwise ``0``.
    """
    if y != 0:
        return x / y
    return 0
# Raise the pandas display limit so the grouped counts print in full.
pd.options.display.max_rows = 999

# Download the dataset only when no local copy exists yet.
if not os.path.isfile(localfile):
    print("Downloading data to " + localfile)
    # Pass the destination explicitly so the download always lands at the
    # path that read_csv opens below, instead of relying on the file name
    # wget derives from the URL happening to match `localfile`.
    wget.download(
        "https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv",
        out=localfile,
    )

# Only the two columns under study are loaded.
df = pd.read_csv(localfile, usecols=["Nationality", "Gender"])

# Derive the coded columns, then drop the raw ones so that only the coded
# values remain in the frame.
df["GenderX"] = df["Gender"].apply(gender_code)
df["NationalityX"] = df["Nationality"].apply(nationality_code)
del df["Nationality"]
del df["Gender"]

# Count the rows for every (nationality, gender code) combination.
df_grouped = df.groupby(["NationalityX", "GenderX"])
print(df_grouped.size())
# TODO: earlier experiments here tried to turn these counts into per-group
# proportions (dividing by the group sum); that was never finished.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement