Advertisement
Guest User

Untitled

a guest
Feb 26th, 2020
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.67 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 2,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "path_inputs = \"/data/master/finance/xrro/data/\"\n",
  10. "usuario = \"E017542\"\n",
  11. "b1_name = \"t_xrro_operation_attributes\"\n",
  12. "b2_name = \"t_xrro_customer_attributes\"\n",
  13. "b3_name = \"t_xrro_financial_metrics_ml\"\n",
  14. "b4_name = \"t_xrro_economic_metrics_ml\"\n",
  15. "b5_name = \"t_xrro_regulatory_metrics_ml\"\n",
  16. "b7_name = \"t_xrro_prospective_metrics_ml\"\n",
  17. "primary_key = [ \"main_operation_id\"]"
  18. ]
  19. },
  20. {
  21. "cell_type": "code",
  22. "execution_count": 3,
  23. "metadata": {},
  24. "outputs": [],
  25. "source": [
  26. "df_B1 = spark.read.parquet(path_inputs + b1_name)\n",
  27. "df_B2 = spark.read.parquet(path_inputs + b2_name)\n",
  28. "df_B3 = spark.read.parquet(path_inputs + b3_name)\n",
  29. "df_B4 = spark.read.parquet(path_inputs + b4_name)\n",
  30. "df_B5 = spark.read.parquet(path_inputs + b5_name)\n",
  31. "df_B7 = spark.read.parquet(path_inputs + b7_name)"
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": null,
  37. "metadata": {},
  38. "outputs": [],
  39. "source": [
  40. "\n",
  41. "#Dataset1 prospectivo\n",
  42. "\n",
  43. "\"\"\"\n",
  44. "b1_list = primary_key + primary_key_2 + [\"cutoff_date\",\"main_efan_area_id\",\"operation_holding_situation_id\",\"block_epigraph_type\"]\n",
  45. "b2_list = primary_key_2 + [\"local_commercial_segment_id\"]\n",
  46. "b7_list = primary_key + [\"cust_nii_stk_lc_amount\", \"fundg_nii_stk_lc_amount\", \"net_fee_stk_lc_amount\",\\\n",
  47. " \"rof_stk_lc_amount\",\"tax_stk_lc_amount\"]\n",
  48. "b4_list = primary_key + [\"ek_crr_eop_lc_amount\", \"ek_or_eop_lc_amount\",\"ec_el_ttc_eop_lc_amount\"]\n",
  49. "b5_list = primary_key + [\"rk_crr_eop_lc_amount\", \"rk_or_eop_lc_amount\"]\n",
  50. "df_B1_select = df_B1.select(b1_list)\n",
  51. "df_B2_select = df_B2.select(b2_list)\n",
  52. "df_B7_select = df_B7.select(b7_list)\n",
  53. "df_B4_select = df_B4.select(b4_list)\n",
  54. "df_B5_select = df_B5.select(b5_list)\n",
  55. "cruce1 = df_B7_select.join(df_B1_select, on = primary_key, how = \"left\")\n",
  56. "cruce2 = cruce1.join(df_B4_select, on = primary_key, how = \"left\")\n",
  57. "cruce3 = cruce2.join(df_B5_select, on = primary_key, how = \"left\")\n",
  58. "cruce3.toPandas().to_csv(\"Prueba1.txt\",sep=';',header=True,index=False,mode='w',decimal=',')\n",
  59. "#list_filter = [\"CE0061235\"]\n",
  60. "#salida_filtrada = cruce3.filter(cruce3[\"main_efan_area_id\"].isin(list_filter))\n",
  61. "#salida_filtrada.toPandas().to_csv(\"salida_filtrada.txt\",sep=';',header=True,index=False,mode='w',decimal=',')\n",
  62. "#cruce3.groupBy([\"cutoff_date\",\"main_efan_area_id\",\"block_epigraph_type\",\"operation_holding_situation_id\"]).sum().toPandas()\n",
  63. "\"\"\"\""
  64. ]
  65. },
  66. {
  67. "cell_type": "code",
  68. "execution_count": 4,
  69. "metadata": {},
  70. "outputs": [],
  71. "source": [
  72. "#GF = 93\n",
  73. "#GTB= 94\n",
  74. "\n",
  75. "Producto=[\"93\"]\n",
  76. "Fecha=[\"2019-04-30\"]\n",
  77. "B1_Filtrado=df_B1.filter(df_B1[\"operation_repository_id\"].isin(Producto)).filter(df_B1[\"cutoff_date\"].isin(Fecha))\n",
  78. "B3_Filtrado=df_B3.filter(df_B3[\"operation_repository_id\"].isin(Producto)).filter(df_B3[\"cutoff_date\"].isin(Fecha))\n",
  79. "B4_Filtrado=df_B4.filter(df_B4[\"operation_repository_id\"].isin(Producto)).filter(df_B4[\"cutoff_date\"].isin(Fecha))\n",
  80. "B5_Filtrado=df_B5.filter(df_B5[\"operation_repository_id\"].isin(Producto)).filter(df_B5[\"cutoff_date\"].isin(Fecha))\n",
  81. "B7_Filtrado=df_B7.filter(df_B7[\"operation_repository_id\"].isin(Producto)).filter(df_B7[\"cutoff_date\"].isin(Fecha))\n",
  82. "\n",
  83. "#B7_max_odate = [str(row['odate_date']) for row in B7_Filtrado.select('odate_date').distinct().collect()]\n",
  84. "#B7_Filtrado_max_odate= B7_Filtrado.filter(B7_Filtrado[\"odate_date\"].isin(B7_max_odate))\n",
  85. "\n"
  86. ]
  87. },
  88. {
  89. "cell_type": "code",
  90. "execution_count": 20,
  91. "metadata": {},
  92. "outputs": [],
  93. "source": [
  94. "#Dataset1 prospectivo 2\n",
  95. "\n",
  96. "b1_list = primary_key + [\"cutoff_date\",\"main_efan_area_id\",\"operation_holding_situation_id\",\"block_epigraph_type\"]\n",
  97. "b4_list = primary_key + [\"ek_crr_eop_lc_amount\", \"ek_or_eop_lc_amount\",\"ec_el_ttc_eop_lc_amount\"]\n",
  98. "b5_list = primary_key + [\"rk_crr_eop_lc_amount\", \"rk_or_eop_lc_amount\"]\n",
  99. "df_B1_select = B1_Filtrado.select(b1_list)\n",
  100. "df_B4_select = B4_Filtrado.select(b4_list)\n",
  101. "df_B5_select = B5_Filtrado.select(b5_list)\n",
  102. "cruce1 = B7_Filtrado.drop('cutoff_date').join(df_B1_select, on = primary_key, how = \"inner\")\n",
  103. "cruce2 = cruce1.join(df_B4_select, on = primary_key, how = \"inner\")\n",
  104. "cruce3 = cruce2.join(df_B5_select, on = primary_key, how = \"inner\")\n"
  105. ]
  106. },
  107. {
  108. "cell_type": "code",
  109. "execution_count": 27,
  110. "metadata": {},
  111. "outputs": [],
  112. "source": [
  113. "cruce3.toPandas().to_csv(\"Abril.txt\",sep=';',header=True,index=False,mode='w',decimal=',')"
  114. ]
  115. },
  116. {
  117. "cell_type": "code",
  118. "execution_count": 24,
  119. "metadata": {},
  120. "outputs": [
  121. {
  122. "data": {
  123. "text/plain": [
  124. "583"
  125. ]
  126. },
  127. "execution_count": 24,
  128. "metadata": {},
  129. "output_type": "execute_result"
  130. }
  131. ],
  132. "source": [
  133. "cruce1.count()"
  134. ]
  135. },
  136. {
  137. "cell_type": "code",
  138. "execution_count": 19,
  139. "metadata": {},
  140. "outputs": [
  141. {
  142. "data": {
  143. "text/plain": [
  144. "[Row(operation_repository_id='93')]"
  145. ]
  146. },
  147. "execution_count": 19,
  148. "metadata": {},
  149. "output_type": "execute_result"
  150. }
  151. ],
  152. "source": [
  153. "B7_Filtrado.select('operation_repository_id').distinct().collect()"
  154. ]
  155. },
  156. {
  157. "cell_type": "code",
  158. "execution_count": 26,
  159. "metadata": {},
  160. "outputs": [
  161. {
  162. "data": {
  163. "text/plain": [
  164. "583"
  165. ]
  166. },
  167. "execution_count": 26,
  168. "metadata": {},
  169. "output_type": "execute_result"
  170. }
  171. ],
  172. "source": [
  173. "B1_Filtrado.dropDuplicates().count()"
  174. ]
  175. },
  176. {
  177. "cell_type": "code",
  178. "execution_count": null,
  179. "metadata": {},
  180. "outputs": [],
  181. "source": []
  182. }
  183. ],
  184. "metadata": {
  185. "kernelspec": {
  186. "display_name": "local1-spark2python3",
  187. "language": "python",
  188. "name": "spark-python-local1-spark2python3"
  189. },
  190. "language_info": {
  191. "codemirror_mode": {
  192. "name": "ipython",
  193. "version": 3
  194. },
  195. "file_extension": ".py",
  196. "mimetype": "text/x-python",
  197. "name": "python",
  198. "nbconvert_exporter": "python",
  199. "pygments_lexer": "ipython3",
  200. "version": "3.5.2"
  201. }
  202. },
  203. "nbformat": 4,
  204. "nbformat_minor": 2
  205. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement