# Source: Pastebin paste "Untitled", posted by a guest on Nov 14th, 2019
# (Python, 8.37 KB). Pastebin page header (advertisement, sign-up banner,
# view counters) is not part of the script.
import datetime as dt
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

  7. # Data
  8. data = pd.read_csv('data/original/listings.csv')
  9. reviews = pd.read_csv('data/cleaned/reviews_cleaned.csv',
  10.                       names=['listing_id', 'comments'])
  11.  
  12. # Pocet stlpcov pred
  13. # print("Columnt size before:\t", len(data.columns))
  14.  
  15. # Vyhodime nepotrebne stlpce
  16. data = pd.DataFrame.drop(
  17.     data,
  18.     columns=[
  19.         'host_name',
  20.         'notes',
  21.         'host_about',
  22.         'calendar_updated',
  23.         'host_acceptance_rate',
  24.         'description',
  25.         'thumbnail_url',
  26.         'experiences_offered',
  27.         'listing_url',
  28.         'name',
  29.         'summary',
  30.         'space',
  31.         'scrape_id',
  32.         'last_scraped',
  33.         'neighborhood_overview',
  34.         'transit',
  35.         'access',
  36.         'interaction',
  37.         'house_rules',
  38.         'medium_url',
  39.         'picture_url',
  40.         'xl_picture_url',
  41.         'host_url',
  42.         'host_thumbnail_url',
  43.         'host_picture_url',
  44.         'host_acceptance_rate',
  45.         'smart_location',
  46.         'license',
  47.         'jurisdiction_names',
  48.         'street',
  49.         'neighbourhood',
  50.         'country',
  51.         'country_code',
  52.         'host_location',
  53.         'host_neighbourhood',
  54.         'market',
  55.         'is_location_exact',
  56.         'square_feet',
  57.         'weekly_price',
  58.         'monthly_price',
  59.         'availability_30',
  60.         'availability_60',
  61.         'availability_90',
  62.         'availability_365',
  63.         'calendar_last_scraped',
  64.         'first_review',
  65.         'last_review',
  66.         'requires_license',
  67.         'calculated_host_listings_count',
  68.         'host_listings_count',
  69.         'zipcode'
  70.     ]
  71. )
  72.  
  73. # print(list(data))
  74.  
  75. print('Splitting host verifications')
  76. host_verification_set = set()
  77.  
  78.  
  79. def collect_host_verifications(entry):
  80.     entry_list = entry    \
  81.         .replace("[", "") \
  82.         .replace("]", "") \
  83.         .replace("'", "") \
  84.         .replace('"', "") \
  85.         .replace(" ", "") \
  86.         .split(',')
  87.  
  88.     for verification in entry_list:
  89.         if (verification != "" and verification != 'None'):
  90.             host_verification_set.add(verification + "_verification")
  91.  
  92.  
  93. # print(data['host_verifications'])
  94.  
  95. data['host_verifications'].apply(collect_host_verifications)
  96.  
  97.  
  98. def generic_verification(entry, v):
  99.     entry_list = str(entry) \
  100.         .replace("[", "")   \
  101.         .replace("]", "")   \
  102.         .replace("'", "")   \
  103.         .replace('"', "")   \
  104.         .replace(" ", "")   \
  105.         .split(',')
  106.  
  107.     for verification in entry_list:
  108.         if (verification + "_verification" == v):
  109.             return 1
  110.  
  111.     return 0
  112.  
  113.  
  114. for v in host_verification_set:
  115.     data.insert(len(list(data)), v, 0)
  116.     data[v] = data['host_verifications'].apply(
  117.         lambda x: generic_verification(x, v)
  118.     )
  119.  
  120. data = pd.DataFrame.drop(data, columns=['host_verifications'])
  121.  
  122.  
  123. def clean_response_rate(entry):
  124.     if (type(entry) == str):
  125.         return entry.replace('%', '')
  126.     return 0
  127.  
  128.  
  129. data['host_response_rate'] = data['host_response_rate'].apply(
  130.     clean_response_rate
  131. )
  132.  
  133.  
  134. def clean_superhost(entry):
  135.     if (entry == 't'):
  136.         return 1
  137.     return 0
  138.  
  139.  
  140. data['host_is_superhost'] = data['host_is_superhost'].apply(clean_superhost)
  141. data['host_has_profile_pic'] = data['host_has_profile_pic'].apply(clean_superhost)
  142. data['host_identity_verified'] = data['host_identity_verified'].apply(clean_superhost)
  143. data['has_availability'] = data['has_availability'].apply(clean_superhost)
  144. data['instant_bookable'] = data['instant_bookable'].apply(clean_superhost)
  145. data['is_business_travel_ready'] = data['is_business_travel_ready'].apply(clean_superhost)
  146. data['require_guest_profile_picture'] = data['require_guest_profile_picture'].apply(clean_superhost)
  147. data['require_guest_phone_verification'] = data['require_guest_phone_verification'].apply(clean_superhost)
  148.  
  149. """
  150. print(list(data))
  151. print(data['host_verifications'][0])
  152. for v in host_verification_set:
  153.    print(v, " ", data[v][0])
  154. """
  155.  
  156.  
  157. def clean_number(entry):
  158.     if (math.isnan(entry)):
  159.         return 0
  160.     return entry
  161.  
  162.  
  163. def clean_number_removal(entry):
  164.     if (math.isnan(entry)):
  165.         return 999
  166.     return entry
  167.  
  168.  
  169. data['bathrooms'] = data['bathrooms'].apply(clean_number_removal)
  170. data['bedrooms'] = data['bedrooms'].apply(clean_number_removal)
  171. data['beds'] = data['beds'].apply(clean_number_removal)
  172.  
  173. data = data[data['bathrooms'] != 999]
  174. data = data[data['bedrooms'] != 999]
  175. data = data[data['beds'] != 999]
  176.  
  177.  
  178. def reviews_per_month_cleanup(entry):
  179.     if (math.isnan(entry)):
  180.         return 0
  181.     return entry
  182.  
  183.  
  184. def clean_price(entry):
  185.     if (type(entry) != str and math.isnan(entry)):
  186.         return 999
  187.     entry1 = entry \
  188.         .replace('$', '') \
  189.         .replace(',', '')
  190.  
  191.     if (float(entry1) == 0):
  192.         return 999
  193.        
  194.     return np.log(float(entry1))
  195.  
  196. data['reviews_per_month'] = data['reviews_per_month'].apply(reviews_per_month_cleanup)
  197. data['price'] = data['price'].apply(clean_price)
  198. data['extra_people'] = data['extra_people'].apply(clean_price)
  199. data['security_deposit'] = data['security_deposit'].apply(clean_price)
  200. data['cleaning_fee'] = data['cleaning_fee'].apply(clean_price)
  201.  
  202. data = data[data['price'] != 999]
  203. data = data[data['extra_people'] != 999]
  204. data = data[data['security_deposit'] != 999]
  205. data = data[data['cleaning_fee'] != 999]
  206.  
  207. def clean_listings_count(entry):
  208.     if (math.isnan(entry)):
  209.         return 1
  210.     return entry
  211.  
  212.  
  213. data['host_total_listings_count'] = data['host_total_listings_count'].apply(clean_listings_count)
  214.  
  215. print('Spliting amenities')
  216. amenities_set = set()
  217.  
  218.  
  219. def collect_amenities(entry):
  220.     entry_list = entry     \
  221.         .replace("{", "")  \
  222.         .replace("}", "")  \
  223.         .replace("'", "")  \
  224.         .replace('"', "")  \
  225.         .replace(" ", "_") \
  226.         .split(',')
  227.  
  228.     for am in entry_list:
  229.         if ('translation_missing' not in am and am != ''):
  230.             amenities_set.add(am)
  231.  
  232. data['amenities'].apply(collect_amenities)
  233. # print(amenities_set)
  234.  
  235. def generic_amenities(entry, amenity):
  236.     entry_list = entry      \
  237.         .replace("{", "")   \
  238.         .replace("}", "")   \
  239.         .replace("'", "")   \
  240.         .replace('"', "")   \
  241.         .replace(" ", "_")  \
  242.         .split(',')
  243.  
  244.     for am in entry_list:
  245.         if (am == amenity):
  246.             return 1
  247.     return 0
  248.  
  249. for amenity in amenities_set:
  250.     data.insert(len(list(data)), amenity, 0)
  251.     data[amenity] = data['amenities'] \
  252.         .apply(lambda x: generic_amenities(x, amenity))
  253.  
  254. # print(data['amenities'][0])
  255. # for v in  amenities_set:
  256. #    print(v, " ", data[v][0])
  257.  
  258.  
  259. # Zahodime nepotrebne, uz pouzite stlpce
  260. data = pd.DataFrame.drop(data, columns=['amenities', 'state'])
  261.  
  262. for col_name in ['property_type', 'bed_type', 'room_type',
  263.                  'neighbourhood_group_cleansed','city', 'cancellation_policy',
  264.                  'host_response_time', 'neighbourhood_cleansed']:
  265.  
  266.     parsed_cols = pd.get_dummies(data[col_name])
  267.  
  268.     # print(data[col_name])
  269.     # print(parsed_cols)
  270.  
  271.     data = data.drop(columns=[col_name])
  272.     data = pd.concat([data, parsed_cols], axis=1)
  273.  
  274.  
  275. def clean_host_since(entry):
  276.     if (type(entry) != str and math.isnan(entry)):
  277.         return 999
  278.     return entry
  279.  
  280.  
  281. data['host_since'] = data['host_since'].apply(clean_host_since)
  282. data = data[data['host_since'] != 999]
  283.  
  284. # Changing the host_since to number with dummy date
  285. dummy_date = dt.datetime(2020, 1, 1)
  286.  
  287. data.host_since = (dummy_date - pd.to_datetime(data.host_since))
  288. data.host_since = data.host_since.apply(lambda x: float(x.days))
  289.  
  290. for col_name in ['review_scores_rating', 'review_scores_accuracy',
  291.                  'review_scores_cleanliness', 'review_scores_checkin',
  292.                  'review_scores_communication', 'review_scores_location',
  293.                  'review_scores_value']:
  294.  
  295.     data[col_name] = data[col_name].apply(lambda x: 0 if np.isnan(x) else x)
  296.  
  297. # Merge reviews csv
  298. data = data.set_index('id').join(reviews.set_index('listing_id'))
  299.  
  300.  
  301. def clean_comments(entry):
  302.     if (type(entry) != str and math.isnan(entry)):
  303.         return 0
  304.     return entry
  305.  
  306. data['comments'] = data['comments'].apply(clean_comments)
  307.  
  308. # Pocet stlpcov po
  309. # print("Columnt size after:\t", len(data.columns))
  310.  
  311. # data.info()
  312. data.to_csv('data/cleaned/data_cleaned.csv')
# End of paste. Pastebin page footer (advertisement, comment box) is not part
# of the script.