Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import os\n",
- "from sklearn.base import BaseEstimator, TransformerMixin\n",
- "from sklearn.pipeline import FeatureUnion\n",
- "from sklearn.linear_model.logistic import LogisticRegression\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
- "sns.set_style('whitegrid')\n",
- "%matplotlib inline\n",
- "import sklearn\n",
- "from sklearn.linear_model import LinearRegression\n",
- "from sklearn import ensemble, preprocessing, cross_validation\n",
- "from sklearn.metrics import roc_auc_score as auc\n",
- "from time import time\n",
- "from sklearn import linear_model\n",
- "from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC\n",
- "from sklearn import datasets\n",
- "from sklearn.linear_model import SGDClassifier\n",
- "from sklearn.linear_model import ElasticNet\n",
- "from sklearn.preprocessing import PolynomialFeatures\n",
- "from sklearn import svm\n",
- "from sklearn.cross_validation import cross_val_score\n",
- "from sklearn.datasets import make_blobs\n",
- "from sklearn.metrics import mean_squared_error\n",
- "from sklearn.datasets import make_friedman1\n",
- "from sklearn.ensemble import GradientBoostingRegressor\n",
- "from sklearn import gaussian_process\n",
- "from sklearn.linear_model import SGDClassifier\n",
- "from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.ensemble import ExtraTreesClassifier\n",
- "from sklearn.tree import DecisionTreeClassifier\n",
- "from sklearn.ensemble import AdaBoostClassifier\n",
- "from sklearn import neighbors, datasets\n",
- "from sklearn import tree\n",
- "from sklearn.datasets import make_hastie_10_2\n",
- "from sklearn.ensemble import GradientBoostingClassifier"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# `__file__` is not defined inside a notebook. The original passed the literal\n",
- "# string \"__file__\" to dirname(), which yields '' and therefore always resolved\n",
- "# to the kernel's working directory. Ask for that directory explicitly.\n",
- "script_path = os.path.abspath(os.getcwd())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "class Get_Price_Rate(BaseEstimator, TransformerMixin):\n",
- "    '''\n",
- "    stateless transformer exposing the PRICE_RATE column as one float feature\n",
- "    '''\n",
- "\n",
- "    def get_feature_names(self):\n",
- "        '''\n",
- "        name of the single feature produced by this transformer\n",
- "\n",
- "        :rtype: list of str\n",
- "        '''\n",
- "\n",
- "        return [self.__class__.__name__]\n",
- "\n",
- "    def fit(self, date_frame, y=None):\n",
- "        '''\n",
- "        fit (nothing is learned; present for the scikit-learn transformer API)\n",
- "\n",
- "        :param pandas.DataFrame date_frame: all data (unused)\n",
- "        :param y: ignored\n",
- "        :rtype: Get_Price_Rate\n",
- "        '''\n",
- "\n",
- "        return self\n",
- "\n",
- "    def transform(self, date_frame):\n",
- "        '''\n",
- "        transform\n",
- "\n",
- "        :param pandas.DataFrame date_frame: all data, must contain PRICE_RATE\n",
- "        :rtype: numpy.ndarray of float, shape (n_rows, 1)\n",
- "        '''\n",
- "\n",
- "        # `Series.as_matrix()` and the `np.float` alias are gone from current\n",
- "        # pandas / NumPy; `.values` and the builtin `float` give the same result\n",
- "        # on old and new releases alike.\n",
- "        price_rate = date_frame[\"PRICE_RATE\"].values\n",
- "\n",
- "        # column vector, one row per input row\n",
- "        return price_rate.astype(float).reshape(-1, 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "class Get_Match_Pref(BaseEstimator, TransformerMixin):\n",
- "    '''\n",
- "    stateless transformer flagging rows where the user's PREF_NAME equals the\n",
- "    coupon's ken_name (1.0 on a match, 0.0 otherwise)\n",
- "    '''\n",
- "\n",
- "    def get_feature_names(self):\n",
- "        '''\n",
- "        name of the single feature produced by this transformer\n",
- "\n",
- "        :rtype: list of str\n",
- "        '''\n",
- "\n",
- "        return [self.__class__.__name__]\n",
- "\n",
- "    def fit(self, date_frame, y=None):\n",
- "        '''\n",
- "        fit (nothing is learned; present for the scikit-learn transformer API)\n",
- "\n",
- "        :param pandas.DataFrame date_frame: all data (unused)\n",
- "        :param y: ignored\n",
- "        :rtype: Get_Match_Pref\n",
- "        '''\n",
- "\n",
- "        return self\n",
- "\n",
- "    def transform(self, date_frame):\n",
- "        '''\n",
- "        transform\n",
- "\n",
- "        :param pandas.DataFrame date_frame: all data, must contain PREF_NAME\n",
- "            and ken_name\n",
- "        :rtype: numpy.ndarray of float, shape (n_rows, 1)\n",
- "        '''\n",
- "\n",
- "        # element-wise comparison; a missing value on either side compares\n",
- "        # unequal and therefore yields 0.0\n",
- "        is_match = date_frame[\"PREF_NAME\"] == date_frame[\"ken_name\"]\n",
- "\n",
- "        # `Series.as_matrix()` and the `np.float` alias are gone from current\n",
- "        # pandas / NumPy; `.values` and the builtin `float` work everywhere.\n",
- "        return is_match.values.astype(float).reshape(-1, 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def top_merge(df, n=10, column=\"predict\", merge_column=\"COUPON_ID_hash\"):\n",
- "    '''\n",
- "    join the `merge_column` values of the n highest-`column` rows into one string\n",
- "\n",
- "    Rows are ordered by ascending `column`, so the best-scoring row comes last\n",
- "    in the result.\n",
- "\n",
- "    :param pandas.DataFrame df: rows to rank\n",
- "    :param int n: number of rows to keep; 0 yields an empty string\n",
- "    :param str column: column to rank by\n",
- "    :param str merge_column: column whose (string) values are joined\n",
- "    :rtype: str\n",
- "    '''\n",
- "\n",
- "    # `sort_index(by=...)` has been removed from pandas; `sort_values` is its\n",
- "    # replacement. `tail(n)` also fixes n=0, where the old `[-n:]` slice kept\n",
- "    # every row instead of none.\n",
- "    top_rows = df.sort_values(by=column).tail(n)\n",
- "\n",
- "    return \" \".join(top_rows[merge_column])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# (name, transformer) pairs handed to FeatureUnion; each transformer\n",
- "# contributes one column to the feature matrix.\n",
- "feature_list = [\n",
- "    (\"PRICE_RATE\", Get_Price_Rate()),\n",
- "    (\"MATCH_PREF\", Get_Match_Pref()),\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Directory holding the competition CSV files. Override it with the\n",
- "# COUPON_DATA_DIR environment variable; the default is the location this\n",
- "# notebook was originally written against. A raw string keeps the Windows\n",
- "# backslashes from being read as escape sequences.\n",
- "DATA_DIR = os.environ.get(\"COUPON_DATA_DIR\", r\"C:\\Users\\Vikrant\\Coupon\")\n",
- "\n",
- "# import csv\n",
- "# (no `if __name__ == '__main__'` guard: the following cells use these frames\n",
- "# unconditionally, and inside a notebook kernel the guard is always true)\n",
- "user_df = pd.read_csv(os.path.join(DATA_DIR, \"user_list.csv\"))\n",
- "train_coupon_df = pd.read_csv(os.path.join(DATA_DIR, \"coupon_list_train.csv\"))\n",
- "train_visit_df = pd.read_csv(os.path.join(DATA_DIR, \"coupon_visit_train.csv\"))\n",
- "test_coupon_df = pd.read_csv(os.path.join(DATA_DIR, \"coupon_list_test.csv\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# create train_df: each visit row joined to the coupon that was viewed and\n",
- "# then to the visiting user (inner joins, so unmatched rows are dropped)\n",
- "train_df = (\n",
- "    train_visit_df\n",
- "    .merge(train_coupon_df,\n",
- "           left_on=\"VIEW_COUPON_ID_hash\", right_on=\"COUPON_ID_hash\")\n",
- "    .merge(user_df, on=\"USER_ID_hash\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# create train feature: labels first, then the stacked transformer outputs\n",
- "y_train = train_df[\"PURCHASE_FLG\"]\n",
- "fu_obj = FeatureUnion(transformer_list=feature_list)\n",
- "X_train = fu_obj.fit_transform(train_df)\n",
- "\n",
- "# every feature row must have exactly one label\n",
- "assert y_train.size == X_train.shape[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',\n",
- " max_depth=3, max_features=None, max_leaf_nodes=None,\n",
- " min_samples_leaf=1, min_samples_split=2,\n",
- " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
- " random_state=50, subsample=1.0, verbose=0, warm_start=False)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# fit model\n",
- "# Experiment log - scores as noted by the original author (metric not recorded):\n",
- "#   RandomForestClassifier      - 0.75985\n",
- "#   ExtraTreesClassifier        - 0.76\n",
- "#   GradientBoostingClassifier  - 0.76858\n",
- "# Also tried, no score noted: LogisticRegression, svm.SVC, SGDClassifier,\n",
- "# DecisionTreeClassifier, KNeighborsClassifier, AdaBoostClassifier.\n",
- "clf = GradientBoostingClassifier(\n",
- "    n_estimators=100,\n",
- "    learning_rate=1.0,\n",
- "    max_depth=3,\n",
- "    random_state=50,\n",
- ")\n",
- "# kept as the last expression so the cell still displays the fitted estimator\n",
- "clf.fit(X_train, y_train)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# create test_df: every (test coupon, user) pair, built as a cross join on a\n",
- "# constant helper key. `assign` puts the key on copies, so test_coupon_df and\n",
- "# user_df themselves are not modified by this cell.\n",
- "test_df = pd.merge(\n",
- "    test_coupon_df.assign(cross=1),\n",
- "    user_df.assign(cross=1),\n",
- "    on=\"cross\",\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# create test Feature: run test_df through the same FeatureUnion (fu_obj)\n",
- "X_test = fu_obj.transform(test_df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# predict test data\n",
- "predict_proba = clf.predict_proba(X_test)\n",
- "\n",
- "# column of predict_proba belonging to the positive class (label 1 / True)\n",
- "pos_idx = np.flatnonzero(clf.classes_ == 1)[0]\n",
- "test_df[\"predict\"] = predict_proba[:, pos_idx]\n",
- "\n",
- "# one space-separated coupon list per user, written out as the submission\n",
- "per_user = test_df.groupby(\"USER_ID_hash\")\n",
- "top10_coupon = per_user.apply(top_merge)\n",
- "top10_coupon.name = \"PURCHASED_COUPONS\"\n",
- "top10_coupon.to_csv(\"submission.csv\", header=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement