Advertisement
Guest User

Untitled

a guest
Jun 29th, 2017
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.04 KB | None | 0 0
  1.  
  2. # Hacking around with this dataset
  3. # https://github.com/MuseumofModernArt/collection/blob/master/Artworks.csv
  4. # download a local copy like this
  5. # wget https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv
  6. # place it in the same directory as the script
  7.  
  8. import csv
  9. import io
  10. import urllib.request
  11. import pandas as pd
  12. import string
  13. import re
  14. import math
  15. import os.path
  16. import numpy
  17. # you need to pip install wget for this to work
  18. import wget
  19.  
  20.  
  21. # download a local copy like this
  22. # wget https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv
  23.  
  24. localfile = "Artworks.csv"  # path of the local CSV copy; the script checks for it with os.path.isfile and reads it with pd.read_csv
  25.  
  26. def gender_code(gender_string):
  27.     if isinstance(gender_string, float) and math.isnan(gender_string):
  28.         return gender_string
  29.     gender_string = re.sub("[()]","", str(gender_string))
  30.     gender_string = gender_string.split(" ")[-1]
  31.     return int(gender_string == "Female")
  32.  
  33. def nationality_code(nationality_string):
  34.     if isinstance(nationality_string, float) and math.isnan(nationality_string):
  35.         return nationality_string
  36.     nationality_string = re.sub("[()]","", str(nationality_string))
  37.     nationality_string = nationality_string.split(" ")[-1]
  38.     return nationality_string
  39.  
  40. def safe_div(x,y):
  41.     print (x,y)
  42.     # if ( y != 0 ):
  43.     #   return x/y
  44.     #else:
  45.     #   return (0)
  46.     return 0
  47.  
  48. pd.options.display.max_rows = 999
  49.  
  50. if not os.path.isfile(localfile):
  51.     print ( "Downloading data to " + localfile)
  52.     wget.download("https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv")
  53.  
  54. df = pd.read_csv(localfile, usecols=["Nationality","Gender"])
  55.  
  56. df["GenderX"] = df["Gender"].apply(gender_code)
  57. df["NationalityX"] = df["Nationality"].apply(nationality_code)
  58.  
  59. del df['Nationality']
  60. del df['Gender']
  61.  
  62. #for index, row in df.iterrows():
  63. #   print (row)
  64.  
  65. df_grouped = df.groupby(["NationalityX","GenderX"])
  66. print( df_grouped.size() )
  67. #print(df_grouped.apply(lambda x: x/x.sum() ))
  68. #print(df_grouped.apply(lambda x: safe_div(x, x.sum()) ))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement