# Untitled

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import datetime as dt

# Input data: the original listings export and the cleaned review comments.
data = pd.read_csv('data/original/listings.csv')
# NOTE(review): `names=` is passed without `header=`, so pandas reads the
# first line of the reviews file as a data row - confirm it has no header.
reviews = pd.read_csv(
    'data/cleaned/reviews_cleaned.csv',
    names=['listing_id', 'comments'],
)

# Column count before the cleanup (debug aid)
# print("Column size before:\t", len(data.columns))

# Drop the columns that are not used further in this script.
# Each label is listed exactly once.
data = data.drop(
    columns=[
        'host_name',
        'notes',
        'host_about',
        'calendar_updated',
        'host_acceptance_rate',
        'description',
        'thumbnail_url',
        'experiences_offered',
        'listing_url',
        'name',
        'summary',
        'space',
        'scrape_id',
        'last_scraped',
        'neighborhood_overview',
        'transit',
        'access',
        'interaction',
        'house_rules',
        'medium_url',
        'picture_url',
        'xl_picture_url',
        'host_url',
        'host_thumbnail_url',
        'host_picture_url',
        'smart_location',
        'license',
        'jurisdiction_names',
        'street',
        'neighbourhood',
        'country',
        'country_code',
        'host_location',
        'host_neighbourhood',
        'market',
        'is_location_exact',
        'square_feet',
        'weekly_price',
        'monthly_price',
        'availability_30',
        'availability_60',
        'availability_90',
        'availability_365',
        'calendar_last_scraped',
        'first_review',
        'last_review',
        'requires_license',
        'calculated_host_listings_count',
        'host_listings_count',
        'zipcode',
    ]
)

# print(list(data))

print('Splitting host verifications')
# Names of the indicator columns to create: one per distinct verification
# method seen in the data, each suffixed with "_verification".
host_verification_set = set()

# Deletes the brackets, quotes and spaces of the serialized list so that only
# the comma-separated method names remain.
_VERIFICATION_STRIP_TABLE = str.maketrans('', '', '[]\'" ')


def collect_host_verifications(entry):
    """Record every verification method named in ``entry``.

    Args:
        entry: One ``host_verifications`` cell, a string such as
            ``"['email', 'phone']"``. Non-string values (for example the
            float NaN pandas uses for a missing cell) are ignored instead of
            raising ``AttributeError``.

    Returns:
        None. The names are added to the module-level
        ``host_verification_set`` as ``"<method>_verification"``.
    """
    if not isinstance(entry, str):
        return

    for verification in entry.translate(_VERIFICATION_STRIP_TABLE).split(','):
        # An empty list leaves '' behind and a literal "None" is not a method.
        if verification and verification != 'None':
            host_verification_set.add(verification + "_verification")


# print(data['host_verifications'])

# First pass over the column: register every verification method that occurs.
for raw_verifications in data['host_verifications']:
    collect_host_verifications(raw_verifications)


def generic_verification(entry, v):
    """Tell whether ``entry`` lists the verification behind column ``v``.

    Args:
        entry: One ``host_verifications`` cell; it is converted with ``str()``
            before parsing, so non-string values are accepted.
        v: Indicator column name of the form ``"<method>_verification"``.

    Returns:
        1 if one of the comma-separated methods in ``entry``, suffixed with
        ``"_verification"``, equals ``v``; otherwise 0.
    """
    cleaned = str(entry)
    # Strip list punctuation and spaces so only comma-separated names remain.
    for unwanted in ("[", "]", "'", '"', " "):
        cleaned = cleaned.replace(unwanted, "")

    column_names = (token + "_verification" for token in cleaned.split(','))
    return 1 if any(name == v for name in column_names) else 0


# Second pass: one 0/1 indicator column per collected verification method.
for v in host_verification_set:
    # Append the column as a zero placeholder, then fill in the real flags.
    data.insert(len(data.columns), v, 0)
    data[v] = data['host_verifications'].apply(generic_verification, args=(v,))

# The serialized list column is fully expanded now and can go.
data = data.drop(columns=['host_verifications'])


def clean_response_rate(entry):
    """Strip the percent sign from a response-rate value.

    Args:
        entry: One ``host_response_rate`` cell, e.g. ``"95%"``.

    Returns:
        The text with every ``'%'`` removed when ``entry`` is a string
        (``str`` subclasses included); the integer 0 for any other value,
        such as a missing (NaN) cell.
    """
    if isinstance(entry, str):
        return entry.replace('%', '')
    return 0


# Remove the '%' from the response rates; non-string cells become 0.
data['host_response_rate'] = data['host_response_rate'].apply(clean_response_rate)


def clean_superhost(entry):
    """Map the flag value ``'t'`` to 1 and every other value to 0."""
    return 1 if entry == 't' else 0


# Convert the flag columns to 1/0: 't' becomes 1, anything else becomes 0.
for flag_column in [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'instant_bookable',
    'is_business_travel_ready',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
]:
    data[flag_column] = data[flag_column].apply(clean_superhost)

"""
print(list(data))
print(data['host_verifications'][0])
for v in host_verification_set:
    print(v, " ", data[v][0])
"""


def clean_number(entry):
    """Return ``entry`` unchanged, or 0 when it is NaN."""
    return 0 if math.isnan(entry) else entry


def clean_number_removal(entry):
    """Return ``entry`` unchanged, or the sentinel 999 when it is NaN.

    The caller filters out rows equal to 999 afterwards.
    """
    return 999 if math.isnan(entry) else entry


# Discard listings whose bathroom, bedroom or bed count is missing.
# dropna() removes exactly the NaN rows, so no sentinel value is needed and a
# genuine count can never be mistaken for "missing".
data = data.dropna(subset=['bathrooms', 'bedrooms', 'beds'])


def reviews_per_month_cleanup(entry):
    """Return ``entry`` unchanged, or 0 when it is NaN."""
    return 0 if math.isnan(entry) else entry


def clean_price(entry):
    """Convert a price cell to the natural logarithm of its amount.

    Args:
        entry: A price string such as ``"$1,250.00"`` (``'$'`` and ``','``
            are stripped before parsing), or a plain number.

    Returns:
        ``np.log(amount)`` for a non-zero amount. The sentinel 999 is
        returned when the amount is missing (NaN) or zero, since the
        logarithm of zero is undefined; the caller drops rows equal to 999.

    Raises:
        ValueError: If a string entry cannot be parsed as a number.
    """
    if isinstance(entry, str):
        amount = float(entry.replace('$', '').replace(',', ''))
    else:
        # Numeric cells are accepted as-is rather than failing on .replace().
        amount = float(entry)

    if math.isnan(amount) or amount == 0:
        return 999

    return np.log(amount)

data['reviews_per_month'] = data['reviews_per_month'].apply(reviews_per_month_cleanup)

# Convert every monetary column with clean_price first ...
for money_column in ['price', 'extra_people', 'security_deposit', 'cleaning_fee']:
    data[money_column] = data[money_column].apply(clean_price)

# ... then drop the rows clean_price flagged with the 999 sentinel.
for money_column in ['price', 'extra_people', 'security_deposit', 'cleaning_fee']:
    data = data[data[money_column] != 999]

def clean_listings_count(entry):
    """Return ``entry`` unchanged, or 1 when it is NaN."""
    return 1 if math.isnan(entry) else entry


# A missing (NaN) listing count is replaced by 1.
data['host_total_listings_count'] = (
    data['host_total_listings_count'].apply(clean_listings_count)
)

print('Spliting amenities')
# Every distinct amenity name seen in the data; each becomes a 0/1 column.
amenities_set = set()

# Deletes braces and quotes and turns spaces into underscores, so only the
# comma-separated amenity names remain.
_AMENITY_CLEAN_TABLE = str.maketrans(
    {'{': None, '}': None, "'": None, '"': None, ' ': '_'}
)


def collect_amenities(entry):
    """Record every amenity named in ``entry``.

    Args:
        entry: One ``amenities`` cell, a string such as
            ``'{TV,"Cable TV",Wifi}'``. Non-string values (for example the
            float NaN pandas uses for a missing cell) are ignored instead of
            raising ``AttributeError``.

    Returns:
        None. The names are added to the module-level ``amenities_set`` with
        spaces replaced by underscores. Empty names and names containing
        ``translation_missing`` are skipped.
    """
    if not isinstance(entry, str):
        return

    for am in entry.translate(_AMENITY_CLEAN_TABLE).split(','):
        if am and 'translation_missing' not in am:
            amenities_set.add(am)

# First pass over the column: register every amenity name that occurs.
for raw_amenities in data['amenities']:
    collect_amenities(raw_amenities)
# print(amenities_set)

def generic_amenities(entry, amenity):
    """Tell whether ``entry`` lists ``amenity``.

    Args:
        entry: One ``amenities`` cell, a string such as
            ``'{TV,"Cable TV",Wifi}'``. A non-string value (for example a
            missing NaN cell) lists no amenity and yields 0 instead of
            raising ``AttributeError``.
        amenity: Amenity name with spaces already replaced by underscores.

    Returns:
        1 if one of the comma-separated names in ``entry`` equals
        ``amenity``; otherwise 0.
    """
    if not isinstance(entry, str):
        return 0

    cleaned = entry.replace(" ", "_")
    # Strip the braces and quotes so only comma-separated names remain.
    for unwanted in ("{", "}", "'", '"'):
        cleaned = cleaned.replace(unwanted, "")

    return 1 if amenity in cleaned.split(',') else 0

# Second pass: one 0/1 indicator column per collected amenity.
for amenity in amenities_set:
    # Append the column as a zero placeholder, then fill in the real flags.
    data.insert(len(data.columns), amenity, 0)
    data[amenity] = data['amenities'].apply(generic_amenities, args=(amenity,))

# print(data['amenities'][0])
# for v in  amenities_set:
#    print(v, " ", data[v][0])


# Drop the columns that are unneeded or have already been used.
data = data.drop(columns=['amenities', 'state'])

# Replace each categorical column by its dummy (one-hot) columns, which are
# appended at the end of the frame.
# NOTE(review): the dummy columns carry no prefix, so a category label shared
# by two source columns would yield duplicate column names - confirm this is
# acceptable for the consumers of the output.
for col_name in [
    'property_type',
    'bed_type',
    'room_type',
    'neighbourhood_group_cleansed',
    'city',
    'cancellation_policy',
    'host_response_time',
    'neighbourhood_cleansed',
]:
    parsed_cols = pd.get_dummies(data[col_name])
    # print(data[col_name]); print(parsed_cols)
    data = pd.concat([data.drop(columns=[col_name]), parsed_cols], axis=1)


def clean_host_since(entry):
    """Return ``entry`` unchanged, or the sentinel 999 for a non-string NaN.

    The caller filters out rows equal to 999 afterwards.
    """
    if type(entry) == str:
        return entry
    return 999 if math.isnan(entry) else entry


# Drop the rows that have no host_since value (flagged with 999).
data['host_since'] = data['host_since'].apply(clean_host_since)
data = data[data['host_since'] != 999]

# Express host_since as the number of days before a fixed reference date.
dummy_date = dt.datetime(2020, 1, 1)

data['host_since'] = (
    (dummy_date - pd.to_datetime(data['host_since']))
    .apply(lambda delta: float(delta.days))
)

# Missing (NaN) review scores are stored as 0.
for col_name in [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]:
    data[col_name] = data[col_name].apply(
        lambda score: score if not np.isnan(score) else 0
    )

# Attach the review comments: listings indexed by 'id' are joined with the
# reviews indexed by 'listing_id' (DataFrame.join defaults to a left join).
data = data.set_index('id').join(
    reviews.set_index('listing_id'),
)


def clean_comments(entry):
    """Return ``entry`` unchanged, or 0 when it is a non-string NaN."""
    if type(entry) == str:
        return entry
    return 0 if math.isnan(entry) else entry

# Rows whose comments value is NaN after the join get 0 instead.
data['comments'] = data['comments'].apply(clean_comments)

# Column count after the cleanup (debug aid)
# print("Column size after:\t", len(data.columns))

# data.info()
data.to_csv('data/cleaned/data_cleaned.csv')