Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import math
- import datetime as dt
# Data
data = pd.read_csv('data/original/listings.csv')
# NOTE(review): passing `names=` without `header=` makes pandas treat the
# first line of the file as data -- confirm reviews_cleaned.csv has no
# header row.
reviews = pd.read_csv(
    'data/cleaned/reviews_cleaned.csv',
    names=['listing_id', 'comments'],
)

# Column count before the clean-up (debug aid)
# print("Columnt size before:\t", len(data.columns))

# Drop the columns that are not needed for the cleaned data set.
data = data.drop(
    columns=[
        'host_name',
        'notes',
        'host_about',
        'calendar_updated',
        'host_acceptance_rate',
        'description',
        'thumbnail_url',
        'experiences_offered',
        'listing_url',
        'name',
        'summary',
        'space',
        'scrape_id',
        'last_scraped',
        'neighborhood_overview',
        'transit',
        'access',
        'interaction',
        'house_rules',
        'medium_url',
        'picture_url',
        'xl_picture_url',
        'host_url',
        'host_thumbnail_url',
        'host_picture_url',
        'smart_location',
        'license',
        'jurisdiction_names',
        'street',
        'neighbourhood',
        'country',
        'country_code',
        'host_location',
        'host_neighbourhood',
        'market',
        'is_location_exact',
        'square_feet',
        'weekly_price',
        'monthly_price',
        'availability_30',
        'availability_60',
        'availability_90',
        'availability_365',
        'calendar_last_scraped',
        'first_review',
        'last_review',
        'requires_license',
        'calculated_host_listings_count',
        'host_listings_count',
        'zipcode',
    ]
)
# print(list(data))
print('Splitting host verifications')

# Every distinct verification method seen so far, stored with a
# "_verification" suffix so it can be used directly as a column name.
host_verification_set = set()


def collect_host_verifications(entry):
    """Record every verification method named in ``entry``.

    Args:
        entry: Raw cell text such as ``"['email', 'phone']"``. Brackets,
            quotes and spaces are stripped before splitting on commas.
            Non-string cells (for example NaN) carry no methods and are
            skipped instead of raising ``AttributeError``.

    Returns:
        None. Each method is added to the module-level
        ``host_verification_set`` as ``"<method>_verification"``; empty
        tokens and the literal ``None`` are ignored.
    """
    if not isinstance(entry, str):
        return
    # One pass that deletes the list punctuation and all spaces.
    stripped = entry.translate(str.maketrans('', '', "[]'\" "))
    for verification in stripped.split(','):
        if verification not in ('', 'None'):
            host_verification_set.add(verification + "_verification")
# print(data['host_verifications'])
# Visit every listing once so host_verification_set ends up holding all
# verification methods that occur in the data.
for raw_verifications in data['host_verifications']:
    collect_host_verifications(raw_verifications)
def generic_verification(entry, v):
    """Tell whether a listing's verification text contains method ``v``.

    ``entry`` is converted with ``str()``, stripped of brackets, quotes and
    spaces, and split on commas. Each resulting token gets the
    ``"_verification"`` suffix before being compared with ``v``.

    Returns:
        1 if any suffixed token equals ``v``, otherwise 0.
    """
    text = str(entry)
    for unwanted in ("[", "]", "'", '"', " "):
        text = text.replace(unwanted, "")
    matched = any(
        token + "_verification" == v for token in text.split(',')
    )
    return 1 if matched else 0
# Append one 0/1 indicator column per verification method.
# The names are sorted because a set of strings has no stable iteration
# order across interpreter runs (string hashing is randomized by default),
# which would make the column order of the output file vary between runs.
for v in sorted(host_verification_set):
    # Insert the computed indicator directly; `insert` still raises if a
    # column with that name already exists.
    data.insert(
        len(data.columns),
        v,
        data['host_verifications'].apply(generic_verification, args=(v,)),
    )

# The raw text column is fully expanded into indicators, so drop it.
data = data.drop(columns=['host_verifications'])
def clean_response_rate(entry):
    """Strip the percent sign from a response-rate cell.

    Args:
        entry: Cell value; text such as ``"95%"`` or a non-string
            (for example NaN) when the rate is missing.

    Returns:
        The text with every ``%`` removed when ``entry`` is a string
        (including ``str`` subclasses), otherwise the integer 0.
    """
    # isinstance, not an exact type comparison, so str subclasses are
    # cleaned instead of being silently replaced by 0.
    if isinstance(entry, str):
        return entry.replace('%', '')
    return 0
# Replace the response-rate column with its cleaned counterpart.
raw_response_rate = data['host_response_rate']
data['host_response_rate'] = raw_response_rate.apply(clean_response_rate)
def clean_superhost(entry):
    """Map the flag text ``'t'`` to 1 and every other value to 0."""
    return 1 if entry == 't' else 0
# Columns holding 't'/'f' flags; each is converted to 1/0 with
# clean_superhost (anything other than 't' becomes 0).
boolean_flag_columns = (
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'instant_bookable',
    'is_business_travel_ready',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
)
for flag_column in boolean_flag_columns:
    data[flag_column] = data[flag_column].apply(clean_superhost)
def clean_number(entry):
    """Replace a missing numeric value with 0.

    Args:
        entry: A scalar cell value.

    Returns:
        0 when ``entry`` is missing (NaN, ``None`` or another pandas
        missing marker), otherwise ``entry`` unchanged.
    """
    # pd.isna also accepts None / pd.NA, where math.isnan raises TypeError.
    if pd.isna(entry):
        return 0
    return entry
def clean_number_removal(entry):
    """Mark a missing numeric value with the sentinel 999.

    Args:
        entry: A scalar cell value.

    Returns:
        999 when ``entry`` is missing (NaN, ``None`` or another pandas
        missing marker), otherwise ``entry`` unchanged.
    """
    # pd.isna also accepts None / pd.NA, where math.isnan raises TypeError.
    if pd.isna(entry):
        return 999
    return entry
# Room counts: missing values are first marked with the 999 sentinel by
# clean_number_removal, then every row carrying that sentinel is dropped.
room_columns = ['bathrooms', 'bedrooms', 'beds']
for room_column in room_columns:
    data[room_column] = data[room_column].apply(clean_number_removal)
for room_column in room_columns:
    data = data[data[room_column] != 999]
def reviews_per_month_cleanup(entry):
    """Replace a missing reviews-per-month value with 0.

    Args:
        entry: A scalar cell value.

    Returns:
        0 when ``entry`` is missing (NaN, ``None`` or another pandas
        missing marker), otherwise ``entry`` unchanged.
    """
    # pd.isna also accepts None / pd.NA, where math.isnan raises TypeError.
    if pd.isna(entry):
        return 0
    return entry
def clean_price(entry):
    """Convert a price cell into the natural log of its amount.

    Args:
        entry: Price text such as ``"$1,250.00"``, a plain number, or a
            missing value (NaN / ``None``).

    Returns:
        ``np.log(amount)`` for a positive amount. The sentinel 999 is
        returned for a missing value and for any amount that is not
        strictly positive, because its logarithm is undefined.

    Raises:
        ValueError: If the text left after removing ``$`` and ``,`` is
            not a valid number.
    """
    if isinstance(entry, str):
        amount = float(entry.replace('$', '').replace(',', ''))
    elif pd.isna(entry):
        return 999
    else:
        # Already numeric: no text clean-up needed (calling .replace on a
        # number would raise AttributeError).
        amount = float(entry)

    # Zero, negative and NaN amounts have no usable logarithm.
    if math.isnan(amount) or amount <= 0:
        return 999
    return np.log(amount)
# Missing reviews-per-month values become 0.
data['reviews_per_month'] = data['reviews_per_month'].apply(
    reviews_per_month_cleanup
)

# Money columns: clean_price returns the log amount, or the 999 sentinel for
# values it rejects; rows carrying the sentinel in any column are dropped.
money_columns = ['price', 'extra_people', 'security_deposit', 'cleaning_fee']
for money_column in money_columns:
    data[money_column] = data[money_column].apply(clean_price)
for money_column in money_columns:
    data = data[data[money_column] != 999]
def clean_listings_count(entry):
    """Replace a missing listings count with 1.

    Args:
        entry: A scalar cell value.

    Returns:
        1 when ``entry`` is missing (NaN, ``None`` or another pandas
        missing marker), otherwise ``entry`` unchanged.
    """
    # pd.isna also accepts None / pd.NA, where math.isnan raises TypeError.
    if pd.isna(entry):
        return 1
    return entry
# Missing host listing counts are replaced by clean_listings_count.
raw_listings_count = data['host_total_listings_count']
data['host_total_listings_count'] = raw_listings_count.apply(
    clean_listings_count
)
print('Spliting amenities')

# Every distinct amenity name seen so far (spaces replaced by underscores).
amenities_set = set()


def collect_amenities(entry):
    """Record every amenity named in ``entry``.

    Args:
        entry: Raw cell text such as ``'{TV,"Cable TV",Wifi}'``. Braces
            and quotes are removed, spaces become underscores, and the
            text is split on commas. Non-string cells (for example NaN)
            carry no amenities and are skipped instead of raising
            ``AttributeError``.

    Returns:
        None. Names are added to the module-level ``amenities_set``;
        empty tokens and tokens containing ``translation_missing`` are
        ignored.
    """
    if not isinstance(entry, str):
        return
    # One pass: delete braces/quotes and turn spaces into underscores.
    normalized = entry.translate(
        str.maketrans({'{': None, '}': None, "'": None, '"': None, ' ': '_'})
    )
    for am in normalized.split(','):
        if am != '' and 'translation_missing' not in am:
            amenities_set.add(am)
# Visit every listing once so amenities_set ends up holding all amenity
# names that occur in the data.
for raw_amenities in data['amenities']:
    collect_amenities(raw_amenities)
# print(amenities_set)
def generic_amenities(entry, amenity):
    """Tell whether a listing's amenity text contains ``amenity``.

    Args:
        entry: Raw cell text such as ``'{TV,"Cable TV"}'``. Braces and
            quotes are removed, spaces become underscores, and the text
            is split on commas. A non-string cell (for example NaN) has
            no amenities.
        amenity: Normalized amenity name to look for (underscores in
            place of spaces).

    Returns:
        1 if one of the normalized tokens equals ``amenity``, otherwise 0.
    """
    if not isinstance(entry, str):
        # Missing cell: previously this raised AttributeError.
        return 0
    normalized = entry.translate(
        str.maketrans({'{': None, '}': None, "'": None, '"': None, ' ': '_'})
    )
    return 1 if amenity in normalized.split(',') else 0
# Append one 0/1 indicator column per amenity.
# The names are sorted because a set of strings has no stable iteration
# order across interpreter runs (string hashing is randomized by default),
# which would make the column order of the output file vary between runs.
for amenity in sorted(amenities_set):
    # Insert the computed indicator directly; `insert` still raises if a
    # column with that name already exists.
    data.insert(
        len(data.columns),
        amenity,
        data['amenities'].apply(generic_amenities, args=(amenity,)),
    )
# print(data['amenities'][0])
# for v in amenities_set:
#     print(v, " ", data[v][0])

# Drop the columns that are no longer needed ('amenities' has been
# expanded into the indicator columns above).
data = data.drop(columns=['amenities', 'state'])
# One-hot encode each categorical column: the original column is removed and
# its indicator columns are appended on the right.
# NOTE(review): the indicator columns are named by raw category value with no
# prefix, so equal values in two source columns would presumably yield
# duplicate column names -- confirm against the data.
categorical_columns = [
    'property_type',
    'bed_type',
    'room_type',
    'neighbourhood_group_cleansed',
    'city',
    'cancellation_policy',
    'host_response_time',
    'neighbourhood_cleansed',
]
for col_name in categorical_columns:
    indicator_cols = pd.get_dummies(data[col_name])
    without_original = data.drop(columns=[col_name])
    data = pd.concat([without_original, indicator_cols], axis=1)
def clean_host_since(entry):
    """Mark a missing host-since value with the sentinel 999.

    Args:
        entry: Cell value; normally date text, or a missing value.

    Returns:
        ``entry`` unchanged when it is a string or any other non-missing
        value; 999 when it is missing (NaN, ``None`` or another pandas
        missing marker).
    """
    if isinstance(entry, str):
        return entry
    # pd.isna also accepts None / NaT, where math.isnan raises TypeError.
    if pd.isna(entry):
        return 999
    return entry
# Rows without a host_since value get the 999 sentinel and are dropped.
data['host_since'] = data['host_since'].apply(clean_host_since)
data = data[data['host_since'] != 999]

# Turn host_since into a number: whole days between the date and a fixed
# dummy reference date, stored as float.
dummy_date = dt.datetime(2020, 1, 1)
data['host_since'] = dummy_date - pd.to_datetime(data['host_since'])
data['host_since'] = data['host_since'].apply(
    lambda delta: float(delta.days)
)

# Missing review scores are replaced by 0.
review_score_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
for col_name in review_score_columns:
    data[col_name] = data[col_name].apply(
        lambda score: score if not np.isnan(score) else 0
    )

# Merge the reviews CSV: listings are indexed by 'id', reviews by
# 'listing_id', and the join keeps every listing.
# NOTE(review): presumably there is at most one reviews row per listing;
# several rows for one listing_id would repeat that listing -- confirm.
listings_by_id = data.set_index('id')
reviews_by_listing = reviews.set_index('listing_id')
data = listings_by_id.join(reviews_by_listing)
def clean_comments(entry):
    """Replace a missing comments value with 0.

    Args:
        entry: Cell value; normally the comment text, or a missing value.

    Returns:
        ``entry`` unchanged when it is a string or any other non-missing
        value; 0 when it is missing (NaN, ``None`` or another pandas
        missing marker).
    """
    if isinstance(entry, str):
        return entry
    # pd.isna also accepts None / pd.NA, where math.isnan raises TypeError.
    if pd.isna(entry):
        return 0
    return entry
# Listings without comments get 0 in the comments column.
joined_comments = data['comments']
data['comments'] = joined_comments.apply(clean_comments)

# Column count after the clean-up (debug aid)
# print("Columnt size after:\t", len(data.columns))
# data.info()

# Write the cleaned data set.
data.to_csv('data/cleaned/data_cleaned.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement