Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from demyst.analytics import Analytics \n",
- "import pandas as pd\n",
- "\n",
- "analytics = Analytics()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# read in the full file\n",
- "# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR so the notebook runs on other machines\n",
- "original_sample = pd.read_csv(\"/Users/jonathanedelman/Downloads/FOIA - 7(a)(FY2010-Present).csv\", encoding=\"latin-1\", low_memory=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Filtering the original file down to an N of 5000. \n",
- "# 1000 chargeoffs and 4000 paid in full \n",
- "\n",
- "# Change this number for a smaller sample\n",
- "N = 5000\n",
- "# Take the more recent loans\n",
- "recent_sample = original_sample.loc[ 2016 <= original_sample[\"ApprovalFiscalYear\"]]\n",
- "# Identify the charge offs and the paid in full\n",
- "recent_charge_offs = recent_sample.loc[recent_sample[\"LoanStatus\"] == \"CHGOFF\"]\n",
- "recent_paid_in_fulls = recent_sample.loc[recent_sample[\"LoanStatus\"] == \"PIF\"]\n",
- "# Take 1/5 of the sample as charge offs, and 4/5 as paid in full\n",
- "recent_charge_off_sample = recent_charge_offs.sample(int(N/5))\n",
- "recent_paid_in_full_sample = recent_paid_in_fulls.sample(int(N*4/5))\n",
- "sample = pd.concat([recent_charge_off_sample, recent_paid_in_full_sample])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Make binary variable binary numbers\n",
- "sample.loc[sample['LoanStatus'] == \"CHGOFF\", 'LoanStatus'] = 1\n",
- "sample.loc[sample['LoanStatus'] == \"PIF\", 'LoanStatus'] = 0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Define Source List\n",
- "source_list = ['attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
- " 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', 'google_latlon', \n",
- " 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
- " 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Doing a single lookup\n",
- "test_attempt_results = analytics.enrich_and_download(source_list, sample[:1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# renaming existing column names to demyst types where applicable\n",
- "sample.rename(columns = {'BorrName':'business_name'}, inplace = True)\n",
- "sample.rename(columns = {'BorrStreet':'street'}, inplace = True)\n",
- "sample.rename(columns = {'BorrCity':'city'}, inplace = True)\n",
- "sample.rename(columns = {'BorrState':'state'}, inplace = True)\n",
- "sample.rename(columns = {'BorrZip':'post_code'}, inplace = True)\n",
- "\n",
- "# add country, since it is common and easy\n",
- "sample['country'] = \"us\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "analytics.validate2(sample)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Follow advice of the validate2 output\n",
- "sample['post_code'] = sample['post_code'].astype(str)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "analytics.validate(sample, providers=source_list)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# as to not waste money when doing screenshots\n",
- "# sample = sample[1:5]\n",
- "sample"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Bing requires a \"query\" column\n",
- "sample['query'] = sample['business_name'] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Append Attom ID for attom products\n",
- "\n",
- "attom_results = analytics.enrich_and_download(['attom_expanded_profile_report'], sample)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample['attom_id'] = attom_results['attom_expanded_profile_report.attom_id']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Append latitude and longitude for Walkscore\n",
- "\n",
- "google_latlon_inputs = pd.DataFrame(columns=['street', 'country'])\n",
- "\n",
- "# For google_latlon, combine all address elements for the street column. For most Demyst providers, street is simply the street address.\n",
- "google_latlon_inputs['street'] = sample[\"street\"] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"] + \" \" + sample[\"post_code\"].map(str)\n",
- "\n",
- "# We are using US Businesses Only\n",
- "google_latlon_inputs['country'] = 'us'\n",
- "\n",
- "google_latlon_results = analytics.enrich_and_download(['google_latlon' ], google_latlon_inputs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample['longitude'] = google_latlon_results['google_latlon.longitude']\n",
- "sample['latitude'] = google_latlon_results['google_latlon.latitude']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Already appended google_latlon, so do not need to do it again.\n",
- "updated_source_list = [\n",
- " 'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
- " 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', \n",
- " 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
- " 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "results = analytics.enrich_and_download(updated_source_list, sample, validate=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create 2 data frames: 1 with the residential data, and one with the commercial data.\n",
- "attom_avm_results = results.filter(regex='attom_avm', axis=1)\n",
- "attom_pre_foreclosure_results = results.filter(regex='attom_pre_foreclosure', axis=1)\n",
- "attom_tax_assessor_results = results.filter(regex='attom_tax_assessor', axis=1)\n",
- "attom_residential = pd.concat([attom_avm_results, attom_pre_foreclosure_results, attom_tax_assessor_results], axis=1)\n",
- "\n",
- "attom_commercial_avm_results = results.filter(regex='attom_commercial_avm', axis=1)\n",
- "attom_commercial_pre_foreclosure_results = results.filter(regex='attom_commercial_pre_foreclosure', axis=1)\n",
- "attom_commercial_tax_assessor_results = results.filter(regex='attom_commercial_tax_assessor', axis=1)\n",
- "attom_commercial = pd.concat([attom_commercial_avm_results, attom_commercial_pre_foreclosure_results, attom_commercial_tax_assessor_results], axis=1)\n",
- "\n",
- "# Rename the columns so we have identical columns from the two data frames\n",
- "attom_residential.columns = attom_residential.columns.str.replace(\"attom_\", \"attom_combined_\")\n",
- "attom_commercial.columns = attom_commercial.columns.str.replace(\"attom_commercial_\", \"attom_combined_\")\n",
- "\n",
- "# Do a combine_first \n",
- "attom_combined = attom_commercial.combine_first(attom_residential)\n",
- "\n",
- "# Remove original attom columns from original results\n",
- "attom_less_results = results[results.columns.drop(list(results.filter(regex='attom_')))]\n",
- "\n",
- "# Combine attom results with the others (and google latlon from earlier)\n",
- "recombined_results = pd.concat([attom_less_results, attom_combined, google_latlon_results], axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "no_dups = recombined_results.loc[:, ~recombined_results.columns.duplicated()]\n",
- "no_dups.to_csv(\"data_robot_input.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment