Guest User

Untitled

a guest
Jan 20th, 2019
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.24 KB | None | 0 0
  1. import numpy as np
  2. from sklearn.model_selection import train_test_split
  3. import xgboost as xgb
  4. import cudf
  5. from cudf.dataframe import DataFrame
  6. from collections import OrderedDict
  7. import gc
  8. from glob import glob
  9. import os
  10. import pyblazing
  11. import pandas as pd
  12. import time
  13. from chronometer import Chronometer
  14.  
  15. from pyblazing import FileSystemType, SchemaFrom, DriverType
  16.  
  17. def register_hdfs():
  18. print('*** Register a HDFS File System ***')
  19. fs_status = pyblazing.register_file_system(
  20. authority="myLocalHdfs",
  21. type=FileSystemType.HDFS,
  22. root="/",
  23. params={
  24. "host": "127.0.0.1",
  25. "port": 54310,
  26. "user": "hadoop",
  27. "driverType": DriverType.LIBHDFS3,
  28. "kerberosTicket": ""
  29. }
  30. )
  31. print(fs_status)
  32.  
  33.  
  34. def deregister_hdfs():
  35. fs_status = pyblazing.deregister_file_system(authority="myLocalHdfs")
  36. print(fs_status)
  37.  
  38. def register_posix():
  39.  
  40. import os
  41. dir_path = os.path.dirname(os.path.realpath(__file__))
  42.  
  43. print('*** Register a POSIX File System ***')
  44. fs_status = pyblazing.register_file_system(
  45. authority="mortgage",
  46. type=FileSystemType.POSIX,
  47. root=dir_path
  48. )
  49. print(fs_status)
  50.  
  51. def deregister_posix():
  52. fs_status = pyblazing.deregister_file_system(authority="mortgage")
  53. print(fs_status)
  54.  
  55. from libgdf_cffi import ffi, libgdf
  56.  
  57. def get_dtype_values(dtypes):
  58. values = []
  59. def gdf_type(type_name):
  60. dicc = {
  61. 'str': libgdf.GDF_STRING,
  62. 'date': libgdf.GDF_DATE64,
  63. 'date64': libgdf.GDF_DATE64,
  64. 'date32': libgdf.GDF_DATE32,
  65. 'timestamp': libgdf.GDF_TIMESTAMP,
  66. 'category': libgdf.GDF_CATEGORY,
  67. 'float': libgdf.GDF_FLOAT32,
  68. 'double': libgdf.GDF_FLOAT64,
  69. 'float32': libgdf.GDF_FLOAT32,
  70. 'float64': libgdf.GDF_FLOAT64,
  71. 'short': libgdf.GDF_INT16,
  72. 'long': libgdf.GDF_INT64,
  73. 'int': libgdf.GDF_INT32,
  74. 'int32': libgdf.GDF_INT32,
  75. 'int64': libgdf.GDF_INT64,
  76. }
  77. if dicc.get(type_name):
  78. return dicc[type_name]
  79. return libgdf.GDF_INT64
  80.  
  81. for key in dtypes:
  82. values.append( gdf_type(dtypes[key]))
  83.  
  84. print('>>>> dtyps for', dtypes.values())
  85. print(values)
  86. return values
  87.  
  88. def get_type_schema(path):
  89. format = path.split('.')[-1]
  90.  
  91. if format == 'parquet':
  92. return SchemaFrom.ParquetFile
  93. elif format == 'csv' or format == 'psv' or format.startswith("txt"):
  94. return SchemaFrom.CsvFile
  95.  
  96. def open_perf_table(table_ref):
  97. for key in table_ref.keys():
  98. sql = 'select * from main.%(table_name)s' % {"table_name": key.table_name}
  99. return pyblazing.run_query(sql, table_ref)
  100.  
  101. def run_gpu_workflow(quarter=1, year=2000, perf_file="", **kwargs):
  102.  
  103. import time
  104.  
  105. load_start_time = time.time()
  106.  
  107. names = gpu_load_names()
  108. acq_gdf = gpu_load_acquisition_csv(acquisition_path=acq_data_path + "/Acquisition_"
  109. + str(year) + "Q" + str(quarter) + ".txt")
  110.  
  111. gdf = gpu_load_performance_csv(perf_file)
  112.  
  113. load_end_time = time.time()
  114.  
  115. etl_start_time = time.time()
  116.  
  117. acq_gdf_results = merge_names(acq_gdf, names)
  118.  
  119. everdf_results = create_ever_features(gdf)
  120.  
  121. delinq_merge_results = create_delinq_features(gdf)
  122.  
  123. new_everdf_results = join_ever_delinq_features(everdf_results.columns, delinq_merge_results.columns)
  124.  
  125. joined_df_results = create_joined_df(gdf.columns, new_everdf_results.columns)
  126. del (new_everdf_results)
  127.  
  128. testdf_results = create_12_mon_features_union(joined_df_results.columns)
  129.  
  130. testdf = testdf_results.columns
  131. new_joined_df_results = combine_joined_12_mon(joined_df_results.columns, testdf)
  132. del (testdf)
  133. del (joined_df_results)
  134. perf_df_results = final_performance_delinquency(gdf.columns, new_joined_df_results.columns)
  135. del (gdf)
  136. del (new_joined_df_results)
  137.  
  138. final_gdf_results = join_perf_acq_gdfs(perf_df_results.columns, acq_gdf_results.columns)
  139. del (perf_df_results)
  140. del (acq_gdf_results)
  141.  
  142. final_gdf = last_mile_cleaning(final_gdf_results.columns)
  143.  
  144. etl_end_time = time.time()
  145.  
  146. return [final_gdf, (load_end_time - load_start_time), (etl_end_time - etl_start_time)]
Add Comment
Please, Sign In to add comment