Advertisement
Guest User

Untitled

a guest
Sep 15th, 2019
222
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.83 KB | None | 0 0
  1. import yaml
  2. from tkinter import messagebox, Label, Button, FALSE, Tk, Entry, Checkbutton, BooleanVar, StringVar
  3. import os
  4. from datetime import datetime
  5. from set_config import set_config
  6. from ftplib import FTP
  7. import pandas as pd
  8. from dateutil.parser import *
  9. from num2words import num2words
  10. import numpy as np
  11. #class Preprocessing
  12. # unit name extraction
  13. class from_ftp:
  14. #retreive files from ftp server, with login, adress, and path from yaml file
  15. path = ""
  16. login_path = ""
  17. remote_path = ""
  18. path_path = ""
  19. def __init__(self, **kwargs):
  20. self.start_str = kwargs.get('start_str')
  21. self.path = kwargs.get('path')
  22. self.str_date_start = kwargs.get('str_date_start')
  23. self.ftp = kwargs.get("ftp")
  24. self.remote_path = kwargs.get("remote_path")
  25. if not (os.path.exists(self.path)):
  26. os.mkdir(self.path)
  27. config_path = self.path + "/Config"
  28. if not(os.path.exists(config_path)):
  29. config = set_config(config_path)
  30. config.setup()
  31. self.login_path = config_path + "/login.yaml"
  32. self.data_path = kwargs.get('data_path')
  33. self.path_path = config_path + "/path.yaml"
  34. #checks: file type, whether the file has the right start, and if it isnt from today
  35. def get_file_date(self, name):
  36. split = name.split("_")
  37. day = split[-1][:-4]
  38. date = parse(split[-3] + "-" + split[-2] + "-" + day)
  39. return date.date()
  40. def filter_remote_files(self):
  41. relevant_files = []
  42. #date_start = datetime.now().date()
  43. now = datetime.now().date()
  44. #if self.str_date_start != None:
  45. date_start = parse(self.str_date_start).date()
  46. #else:
  47. #data_start = parse('2017-1-1').date()
  48. for name in self.ftp.nlst():
  49. if(name.startswith(self.start_str)):
  50. file_date = self.get_file_date(name)
  51. if(file_date < now and file_date >= date_start ):
  52. relevant_files.append(name)
  53. return relevant_files
  54. #retrieves the files from remote server
  55. def retrieve_files(self):
  56. if(self.data_path == None):
  57. with open(self.path_path, 'r') as stream:
  58. data_dict = yaml.safe_load(stream)
  59. data_path = data_dict.get('File')
  60. else:
  61. data_path = self.data_path
  62. data_path_files = []
  63. h_remote_files = []
  64.  
  65.  
  66.  
  67. print('BUILDING LOCAL DIR FILE LIST...')
  68. for file_name in os.listdir(data_path):
  69. data_path_files.append(file_name) # populate local dir list
  70. cmdcmd = 'CWD ' + self.remote_path
  71. self.ftp.sendcmd(cmdcmd)
  72. print('BUILDING REMOTE DIR FILE LIST...\n')
  73. h_remote_files = self.filter_remote_files()
  74.  
  75. h_diff = sorted(list(set(h_remote_files) - set(data_path_files))) # difference between two lists
  76.  
  77. for h in h_diff:
  78. with open(os.path.join(data_path,h), 'wb') as ftpfile:
  79. s = '0'
  80. print('File: ' + h)
  81. s = self.ftp.retrbinary('RETR ' + h, lambda d: ftpfile.write(d)) # retrieve file
  82. if str(s).startswith('226'): # comes from ftp status: '226 Transfer complete.'
  83. print ('\nOK\n') # print 'OK' if transfer was successful
  84. else:
  85. print (s) # if error, print retrbinary's return
  86. #try to login with current user info
  87. #try to login with current user info
  88. def try_login(self, username, password):
  89. print("Trying to login...")
  90. print (username)
  91. try:
  92. self.ftp.login(username, password)
  93. except Exception as e:
  94. messagebox.showinfo("-- ERROR --", "Please enter valid infomation!", icon="warning")
  95. #enter user info from yaml file
  96.  
  97. #main function, establishes shared variables, and makes sure the program can run, and all config is setup
  98. def ftp_sync(self):
  99.  
  100. username = "" #mutable variable for username
  101. password = ""#mutable var for pwd
  102. with open(self.login_path, 'r') as stream:
  103. login_info = yaml.safe_load(stream)
  104. username = login_info.get("Username")
  105. password = login_info.get("Password")
  106. print(username)
  107. self.try_login(username, password)
  108. #if login is sucsessful retrieve files
  109. self.retrieve_files()
  110. class Preprocessing_Base:
  111. # path, alarm_name, del_cols, intended_cols, expected_cols, cols_to_fix, deviation_cols, min_cols
  112. def __init__(self,**kwargs):
  113. self.path = kwargs.get('path')
  114. self.from_path = kwargs.get('from_path')
  115. self.alarm_name = kwargs.get('alarm_name')
  116. self.del_cols = kwargs.get('del_cols')
  117. self.intended_cols = kwargs.get('intended_cols')
  118. self.cols_to_fix = kwargs.get('cols_to_fix')
  119. self.deviation_cols = kwargs.get('deviation_cols')
  120. self.one_strs = kwargs.get("one_strs")
  121. self.temp_cols = kwargs.get("temp_cols")
  122. self.zero_strs = kwargs.get("zero_strs")
  123. self.binary_cols = kwargs.get("binary_cols")
  124. #derives unit_name from filename
  125. def unit_name(self, cols, file_n, df):
  126. if len(cols) == len(self.intended_cols):
  127. # Add the Unit_Names for the file_names
  128. unitName = file_n.split("_")[0]
  129. df["Unit_Name"] = unitName.upper()
  130. else:
  131. df = df.drop(df.columns[len(self.intended_cols):],axis=1)
  132. # Add the Unit_Names for the file_names
  133. unitName = file_n.split("_")[0]
  134. df["Unit_Name"] = unitName.upper()
  135. return unitName
  136. #//
  137. #sets values to string for entire column
  138. def df_to_string(self, column, df):
  139. #alarm strin
  140. df[column] = df[column].astype(str)
  141. #/
  142. #changes zeroes and ones to zerostr and onestr
  143. def binary_to_string(self, binary_col, df):
  144.  
  145. if(binary_col in df.columns):
  146. self.df_to_string(binary_col, df)
  147.  
  148. if(binary_col[-2] == '_'):
  149. key_col = binary_col[:-2]
  150. else:
  151. key_col = binary_col
  152. n = []
  153. for val in df[binary_col].values:
  154. if(val == '1'):
  155. n.append(self.one_strs.get(key_col))
  156. elif(val == '0'):
  157. n.append(self.zero_strs.get(key_col))
  158. else:
  159. n = df[binary_col]
  160. break
  161. df[binary_col] = pd.DataFrame(n)
  162. def binary_col_array(self, binary_cols, df):
  163. for binary_col in binary_cols:
  164. self.binary_to_string(binary_col, df)
  165. #old more specific code I dont feel like replacingough del_cols and sees if any are in df if any are they get yote
  166. def delete_cols(self, df, del_cols):
  167. # Delete unused columns if they exist
  168. for thisCol in del_cols:
  169. if thisCol in df.columns:
  170. df.drop(thisCol, axis=1, inplace=True)
  171. #print(thisCol + "dropped")
  172. #/ abs value of deviations basically useless and shouldnt be in base but I dont want to restructure so here it stays
  173. def take_abs_of_devs(self, df):
  174. for dev_col in self.deviation_cols:
  175. df[dev_col] = pd.DataFrame(np.absolute(df[dev_col].values))
  176. #*/
  177. #creates file based on cleaned up frame
  178. def create_file(self, result_dir, fileN, df):
  179. resultFrame = df
  180. if resultFrame.shape[0] > 0:
  181. k = result_dir+"\\"+fileN[:-4] +"_.csv"
  182. resultFrame.to_csv(k)
  183. #prepares lifetime value columns, but I didnt really understand that when I wrote it
  184. def prepare_col(self, name, zero, df):
  185. if(name in df.columns):
  186. print('ayayay')
  187. if df[name].dtype == "int64":
  188. n = np.ediff1d(df[name].values)
  189. n = np.append(zero, n)
  190. n = n.clip(min=0, max=1)
  191.  
  192. df[name] = pd.DataFrame(n)
  193. elif df[name].values[0].isdigit():
  194. name_int = [int(x) for x in df[name].values]
  195. n = np.ediff1d(name_int)
  196. n = np.append(zero, n)
  197. n = n.clip(min=0, max=1)
  198.  
  199. df[name] = pd.DataFrame(n)
  200. #prepares an array of lifetime value cols
  201. def prepare_arr_of_cols(self, zero, cols_to_fix, df):
  202. for col in cols_to_fix:
  203. self.prepare_col(col, zero, df)
  204. #finds files that are different between result dir and data directory
  205. def find_different(self, data_path, result_dir):
  206. already_processed = []
  207. file_names = []
  208. ##### Make sure to read only the csv files in directory
  209. with os.scandir(result_dir) as listOfEntries:
  210. for entry in listOfEntries:
  211. if entry.name[-5].isdigit():
  212. already_processed.append(entry.name[:-6] + ".csv")
  213. else :
  214. already_processed.append(entry.name[:-5] + ".csv")
  215. with os.scandir(data_path) as listOfEntries:
  216. for entry in listOfEntries:
  217. # print all entries that are files
  218.  
  219. if entry.is_file() and entry.name[-4:] == ".csv":
  220.  
  221. file_names.append(entry.name)
  222.  
  223. file_names = sorted(list(set(file_names) - set(already_processed)))
  224. return file_names
  225. #names a multiple unit NAMEONE and NAMETWO post split
  226. def delta_t(self, temp_cols, df):
  227. df[temp_cols["out"]] = df[temp_cols["out"]].astype(float)
  228. df[temp_cols["in"]] = df[temp_cols["in"]].astype(float)
  229.  
  230. return(df[temp_cols["out"]] - df[temp_cols["in"]])
  231. def unit_name_multiple(self, data_path, df, i):
  232. location = os.path.basename(os.path.normpath(data_path))
  233. name = location.split(" ")[0] + num2words(i)
  234. df["Unit_Name"] = name.upper()
  235.  
  236. df["Location"] = location.upper()
  237. df["Category"] = "Field"
  238.  
  239. def unit_name_from_path(self, data_path, df):
  240. name = os.path.basename(os.path.normpath(data_path))
  241. name = name.split(" ")[0]
  242. df["Unit_Name"] = name.upper()
  243. df["Location"] = name.upper()
  244. df["Category"] = "Field"
  245. if not self.temp_cols == None:
  246. df["Delta_T"] = self.delta_t(self.temp_cols, df)
  247.  
  248. #extends beyond just multiples
  249. def del_row_with_dashes(self, df):
  250. for col in df.columns[2:]:
  251. df.drop(df.loc[df[col] == '---'].index, inplace =True)
  252.  
  253. def create_multiple_file(self, df, result_dir, fileN, i, intended_cols_i, data_path):
  254. try:
  255. split_df = df[intended_cols_i].copy()
  256. split_df.columns = self.intended_cols[:-1]
  257. self.del_row_with_dashes(split_df)
  258. zero = np.array([0])
  259.  
  260. self.prepare_arr_of_cols(zero, self.cols_to_fix, split_df)
  261. if not self.temp_cols == None:
  262. split_df["Delta_T"] = self.delta_t(self.temp_cols, split_df)
  263. self.unit_name_multiple(data_path, split_df, i)
  264. self.binary_col_array(self.binary_cols, split_df)
  265.  
  266. split_df["Delta_T"] = self.delta_t(self.temp_cols, split_df)
  267. k = result_dir+"\\"+fileN[:-4] +"_" + str(i) +".csv"
  268. split_df.to_csv(k)
  269. except Exception as e:
  270. pass
  271. #meant to format multiple cols but would probably just end up being overwritten every single module so its abstract
  272. def format_multiple_cols(self, df, fileN, result_dir, data_path, i, last_int):
  273. colnames = df.columns.values.tolist()
  274. intended_cols_i =['Time', 'Date'] + [col + "_" + str(i) for col in self.intended_cols[2:-1]]
  275. print(len(intended_cols_i))
  276.  
  277. #if not self.binary_cols == None:
  278. #intended_cols_i.append("SW_VERSN")
  279. #df.columns = intended_cols_i
  280. zero = np.array([0])
  281. #self.df_to_string(alarm_i, df)
  282. self.unit_name_multiple(data_path, df, i)
  283. self.create_multiple_file(df, result_dir, fileN, i, intended_cols_i, data_path)
  284. if i < int(last_int):
  285. i+= 1
  286. self.format_multiple_cols(df, fileN, result_dir, data_path, i, last_int)
  287. #I should probably move this into my data files instead of the module since it is the main driving fnction
  288. def format_cols(self, cols, df, fileN, result_dir, data_path):
  289. if not self.alarm_name == None:
  290. self.df_to_string(self.alarm_name, df)
  291. if df.shape[0] > 1: #Ignore changing the files with only one row
  292. cols = df.columns
  293. #retrieve unit name
  294. if(self.from_path == True):
  295. self.unit_name_from_path(data_path, df)
  296. else:
  297. self.unit_name(df.columns, fileN, df)
  298.  
  299. # Fix the Gallons Columns, Successful Ignitions, Failed Ignitions
  300. # ,Flame Failures, Burner Minutes
  301. zero = np.array([0])
  302. print(fileN)
  303. self.prepare_arr_of_cols(zero, self.cols_to_fix, df)
  304. # Taking the Absolute Value of the both the Deviations for easy
  305. # analysis
  306. if not self.deviation_cols == None:
  307. self.take_abs_of_devs(df)
  308. if not self.binary_cols == None:
  309. self.binary_col_array(self.binary_cols, df)
  310. print(fileN, len(cols), cols[-1], cols[0])
  311.  
  312. #Save only the files that contain info
  313. self.create_file(result_dir, fileN, df)
  314. else:
  315. print("\n\n\n*******\n", fileN, " = not considered in analysis\n because it has only 1 line of data")
  316. #another main driving function of the module, meant to loop through the new files and process them
  317. def check_if_multiple(self, df, last_int):
  318. if(last_int.isdigit()):
  319. if(int(last_int) > 1):
  320. return True
  321. def read_files(self, data_path, result_dir):
  322. min_cols = len(self.intended_cols)
  323. file_names = self.find_different(data_path, result_dir)
  324. for fileN in file_names:
  325. #create df
  326. try:
  327. df = pd.read_csv(data_path + "\\" + fileN
  328. )
  329. except Exception as e:
  330. continue
  331. if not self.del_cols== None:
  332. self.delete_cols(df, self.del_cols)
  333.  
  334. cols = df.columns
  335. colnames = df.columns.values.tolist()
  336. last_col = colnames[-12]
  337. last_int = last_col[-1]
  338. if( len(cols) == min_cols):
  339. df.columns = self.intended_cols
  340. self.format_cols(cols, df, fileN, result_dir, data_path)
  341. elif(self.check_if_multiple(df, last_int)):
  342. self.format_multiple_cols(df, fileN, result_dir, data_path, 1, last_int)
  343.  
  344.  
  345. #ipdb.set_trace()
  346. #main main driving function, starts the other two
  347. def main(self, data_path, result_dir):
  348. config_path = self.path + "/Config"
  349. login_path = config_path + "/login.yaml"
  350. path_path = config_path + "/path.yaml"
  351. result_dir = self.path + result_dir
  352. if os.path.exists(result_dir) == False:
  353. os.mkdir(result_dir)
  354. self.read_files(data_path, result_dir)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement