# Untitled


# Hacking around with this dataset
# https://github.com/MuseumofModernArt/collection/blob/master/Artworks.csv
# download a local copy like this
# wget https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv
# place it in the same directory as the script

import csv
import io
import urllib.request
import pandas as pd
import string
import re
import math
import os.path
import numpy
# you need to pip install wget for this to work
import wget


# File name of the local copy of the dataset. The script checks whether this
# file exists, downloads it when it does not, and then reads it with pandas.
# To fetch it by hand instead:
# wget https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv

localfile = "Artworks.csv"

def gender_code(gender_string):
    """Encode a raw Gender cell as 1 (female) or 0 (anything else).

    Parentheses are removed from the text form of the value and only the last
    space-separated token is inspected, so for a cell listing several entries
    the final one decides the code.

    Args:
        gender_string: Raw cell value. A float NaN is treated as "missing".

    Returns:
        The NaN itself for missing input; otherwise ``1`` when the last token
        is exactly ``"Female"`` and ``0`` in every other case.
    """
    is_missing = isinstance(gender_string, float) and math.isnan(gender_string)
    if is_missing:
        # Pass the NaN through untouched so the caller still sees it as missing.
        return gender_string

    without_parens = re.sub("[()]", "", str(gender_string))
    # Everything after the final space (or the whole text when there is none).
    last_token = without_parens.rpartition(" ")[2]
    return 1 if last_token == "Female" else 0

def nationality_code(nationality_string):
    """Return the nationality of the last entry listed in a Nationality cell.

    Cells are assumed to look like ``"(American)"`` or, with several entries,
    ``"(French) (South African)"`` -- TODO confirm against the CSV. The
    content of the last parenthesised group is returned as a whole, so
    multi-word nationalities such as ``"Costa Rican"`` are no longer cut down
    to their final word (``"Rican"``).

    Args:
        nationality_string: Raw cell value. A float NaN is treated as
            "missing".

    Returns:
        The NaN itself for missing input; otherwise a ``str`` holding the last
        nationality (possibly empty, e.g. for ``"()"``).
    """
    if isinstance(nationality_string, float) and math.isnan(nationality_string):
        # Pass the NaN through untouched so the caller still sees it as missing.
        return nationality_string

    text = str(nationality_string)
    # Capture what sits between each "(" and ")" pair, spaces included.
    groups = re.findall(r"\(([^()]*)\)", text)
    if groups:
        return groups[-1].strip()

    # No parenthesised group at all: keep the previous behaviour of returning
    # the last space-separated token with any stray parentheses removed.
    return re.sub("[()]", "", text).split(" ")[-1]

def safe_div(x, y):
    """Divide ``x`` by ``y``, returning 0 instead of failing on a zero divisor.

    Previously this was a debugging stub that printed its arguments and always
    returned 0; the guarded division it was meant to perform is restored here.

    Args:
        x: Dividend.
        y: Divisor.

    Returns:
        ``x / y``, or ``0`` when the division raises ``ZeroDivisionError``.

    Note:
        Only divisions that actually raise ``ZeroDivisionError`` are guarded.
        NOTE(review): numpy/pandas operands may follow their own
        division-by-zero rules (inf/NaN) instead of raising -- confirm before
        relying on this for array data.
    """
    try:
        return x / y
    except ZeroDivisionError:
        return 0

# Raise pandas' display limit so the grouped table below is printed in full
# (up to 999 rows) rather than truncated.
pd.options.display.max_rows = 999

# Download the dataset only when no local copy exists yet.
if not os.path.isfile(localfile):
    print("Downloading data to " + localfile)
    # Pass `out` explicitly so the file is written to the exact path that is
    # checked above and read below, instead of a name derived from the URL.
    wget.download(
        "https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv",
        out=localfile,
    )

# Only these two columns are needed for the tally.
df = pd.read_csv(localfile, usecols=["Nationality", "Gender"])

# Derive the coded columns from the raw text columns.
df["GenderX"] = df["Gender"].apply(gender_code)
df["NationalityX"] = df["Nationality"].apply(nationality_code)

# The raw columns are no longer needed once the coded ones exist.
del df['Nationality']
del df['Gender']

# Count artworks per (nationality, gender code) pair.
# NOTE(review): rows whose codes are NaN are presumably left out by groupby's
# default handling of missing keys -- verify for the installed pandas version.
df_grouped = df.groupby(["NationalityX", "GenderX"])
print(df_grouped.size())

# Unfinished experiments (per-group proportions), kept for reference:
#print(df_grouped.apply(lambda x: x/x.sum() ))
#print(df_grouped.apply(lambda x: safe_div(x, x.sum()) ))