Advertisement
Danila_lipatov

Distribution

Sep 2nd, 2022 (edited)
167
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.92 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. import openpyxl as pxl
  5. import seaborn as sns
  6. import statistics
  7. from collections import OrderedDict
  8. from dateutil.relativedelta import relativedelta
  9. from datetime import datetime
  10. import textwrap
  11.  
  12.  
  13. def get_val(df):
  14.     x = 0
  15.     for i in df:
  16.         x = i
  17.     return x
  18.  
  19.  
  20.  
  21.  
  22. def graph_id_rating(df):
  23.     df = pd.DataFrame(df.groupby(['рейтинг', 'флаг']).count()).reset_index().set_index('рейтинг')
  24.     df.drop('report_date', inplace=True, axis=1)
  25.     df = df.pivot(columns='флаг')
  26.     return df
  27.  
  28. #function for create df year per id
  29. def graph_year_id(df):
  30.     df.drop('флаг', inplace=True, axis=1)
  31.     df.drop('рейтинг', inplace=True, axis=1)
  32.     df = pd.DataFrame(df.groupby(['report_date']).count())
  33.     df = df.reset_index()
  34.     df['report_date'] = pd.DatetimeIndex(df['report_date']).year
  35.     df = df.set_index('report_date')
  36.     return df
  37.  
  38.  
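# Step specifications understood by step_graph (shorthand: 1y, 2y, 3m).
# 'scale' selects the unit and 'scale_factor' how many units make one step:
# {'scale': 'month', 'scale_factor': 4}    -> one step = 4 months
# {'scale': 'year', 'scale_factor': 2}     -> one step = 2 years
# {'scale': 'quartal', 'scale_factor': 1}  -> one step = 1 quarter (3 months);
#                                             use 2 for half-year steps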
  46.  
  47.  
  48.  
  49.  
  50. #function for different check
  51. #TODO check solution using udf functions, not iterate over dicts
  52. def step_graph(df, step):
  53.     year_dict = {}
  54.     temp_2 = {}
  55.     temp = {}
  56.     for date, i in df.iterrows():
  57.         if i[1] not in year_dict:
  58.             year_dict[i[1]] = []
  59.             temp_2[i[1]] = []
  60.         year_dict[i[1]].append(i[0].strftime('%Y-%m'))
  61.     value_get = ''
  62.     for key, values in year_dict.items():
  63.         value_get = year_dict[key][0]
  64.         while value_get < year_dict[key][len(year_dict[key]) - 1]:
  65.             # TODO CHECK WHILE OR FOR TO ITERATE OVER THE ALL DATES
  66.             if step['scale'] == 'month':
  67.                 value_get = datetime.strptime(value_get, '%Y-%m')
  68.                 value_get = value_get + (relativedelta(months=step['scale_factor']))
  69.                 value_get = value_get.strftime('%Y-%m')
  70.             elif step['scale'] == 'year':
  71.                 value_get = datetime.strptime(value_get, '%Y-%m')
  72.                 value_get = value_get + (relativedelta(years=step['scale_factor']))
  73.                 value_get = value_get.strftime('%Y-%m')
  74.             elif step['scale'] == "quartal":
  75.                 value_get = datetime.strptime(value_get, '%Y-%m')
  76.                 value_get = value_get + (relativedelta(months=step['scale_factor'] * 3))    #todo redact this step
  77.                 value_get = value_get.strftime('%Y-%m')
  78.             if value_get in values:
  79.                 temp_2[key].append(value_get)
  80.     check = []
  81.     df_1 = pd.DataFrame(columns=df.columns)
  82.     df_1 = df_1.dropna()
  83.     for valeu in temp_2.values():
  84.         check.append(len(valeu))
  85.     for x in check:
  86.         if x not in temp:
  87.             temp[x] = []
  88.         temp[x].append(check.count(x))
  89.     for key, value in temp.items():
  90.         df_1.at[key] = len(value)
  91.         print(key, len(value))
  92.     df_1.drop('report_date', inplace=True, axis=1)
  93.     df_1 = df_1.sort_index()
  94.     print(df_1)
  95.     return df_1
  96.  
  97.  
  98. #reading excel file
  99. step = {'scale': 'year', 'scale_factor': 2}
  100. df = pd.read_excel("cases_sample_rebuilt.xlsx", sheet_name="Лист3")
  101.  
  102. year_dict = {}  #dict for create df
  103. temp = []   #mass for count number of id/years/etc
  104. val = []    #mass for median, mean, etc
  105. for date, i in df.iterrows():
  106.     if i[1] not in year_dict:
  107.         year_dict[i[1]] = []
  108.     temp.append(i[0].strftime('%Y'))
  109.     year_dict[i[1]].append(i[0].strftime('%Y'))
  110.  
  111. df_1 = pd.DataFrame(columns=df.columns) #creating df
  112. df_1 = df_1.dropna()
  113.  
  114.  
  115. for key, value in year_dict.items():
  116.     df_1.at[key] = len(set(temp)) - len(value)  #key --- (value = count of id)
  117.     val.append(len(set(temp)) - len(value))
  118. print(year_dict)
  119.  
  120. med = get_val(df_1.median())    #median
  121. mean = get_val(df_1.mean())     #mean
  122. mod = (statistics.mode(val))    #mod
  123. sted = get_val(df_1.std())      #std
  124.  
  125. types = ['density', 'count', 'percent', 'frequency', 'dependence id-year', 'dependence year-id', 'step']      #todo add step checker
  126. for i in types:
  127.     fig = plt.figure(figsize=(10, 10))
  128.     if i == 'dependence id-year':
  129.         df_1 = graph_id_rating(df)
  130.         df_1.plot.bar(stacked=True)
  131.         plt.xlabel('Dates')
  132.         plt.ylabel('number of unique objects')
  133.     elif i == 'dependence year-id':
  134.         df_1 = graph_year_id(df)
  135.         df_1.plot.bar()
  136.         plt.ylabel('number of unique objects at date')
  137.         plt.xlabel('Number of dates')
  138.     elif i == 'step' :
  139.         df_1 = step_graph(df, step)
  140.         df_1.plot.bar()
  141.         plt.ylabel('number of unique objects')
  142.     else:
  143.         sns.histplot(data=df_1, kde=True,bins = list(range(0, len(set(temp)), 1)), stat=i, legend=False)
  144.     plt.xlabel(
  145.         f'median:{med}, mod:{mod}, mean:{round(mean, 2)}, sted:{round(sted, 2)}'
  146.         , labelpad=20, fontsize=5, loc='center')
  147.     plt.savefig(f'picture{i}', dpi=350, bbox_inches='tight')
  148.     plt.close(fig)
  149.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement