Advertisement
Guest User

Untitled

a guest
Feb 27th, 2017
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.40 KB | None | 0 0
  1. import pandas as pd
  2. import io
  3.  
  4.  
  # Field separator used to split rows; a bytes value because rows are split
  # before they are decoded to text.
  delimiter = b","

  # Input CSV, and the output path obtained by replacing ".csv" with
  # "_final.csv" in the input name.
  file_name = "RSEG_for_edd_v1.csv"
  output_file = file_name.replace(".csv", "_final.csv")
  8.  
  9.  
  def read_data(file_name):
      """Return the lines of *file_name* as bytes, without line terminators.

      The file is opened in binary mode, so no text decoding happens here.
      For every line, ``b"\\n"`` and then ``b"\\r"`` bytes are stripped from
      both ends.

      Args:
          file_name: Path of the file to read.

      Returns:
          A list with one ``bytes`` object per line of the file.
      """
      with open(file_name, "rb") as handle:
          raw_lines = handle.readlines()

      cleaned = []
      for line in raw_lines:
          without_lf = line.strip(b"\n")
          cleaned.append(without_lf.strip(b"\r"))
      return cleaned
  17.  
  18.  
  def encode_in_unicode(data, delimiter):
      """Decode byte rows to text cell by cell and join them into one string.

      Each row is split on *delimiter*. A cell that is valid UTF-8 is decoded
      as UTF-8; a cell that is not is decoded as Latin-1 instead, which
      accepts every byte value. The decoded cells are re-joined with the
      delimiter and the rows are joined with ``"\\n"``.

      The input list is left untouched: a new list of decoded rows is built
      rather than overwriting ``data`` in place.

      Args:
          data: Iterable of ``bytes`` rows (without line terminators).
          delimiter: ``bytes`` separator between cells; must itself be valid
              UTF-8.

      Returns:
          A single ``str`` holding all rows separated by newlines; the empty
          string when *data* is empty.

      Raises:
          UnicodeDecodeError: If *delimiter* is not valid UTF-8.
      """
      text_delimiter = delimiter.decode("utf-8")

      decoded_rows = []
      for row in data:
          cells = []
          for cell in row.split(delimiter):
              try:
                  cells.append(cell.decode("utf-8"))
              except UnicodeDecodeError:
                  # Crude stdout marker: one "1-" per cell that was not valid
                  # UTF-8 and needed the Latin-1 fallback.
                  print(1, end="-")
                  cells.append(cell.decode("latin-1"))
          decoded_rows.append(text_delimiter.join(cells))

      return "\n".join(decoded_rows)
  39.  
  40.  
  def to_df(data, sep="\t"):
      """Parse delimited text into a DataFrame and print a preview of it.

      Every U+FEFF character (byte-order mark) is removed from the text
      before parsing, so a leading BOM does not end up in the first column
      name.

      Args:
          data: The whole table as one ``str``, rows separated by newlines.
          sep: Field separator handed to ``pd.read_table``. Defaults to a tab,
              which is what ``pd.read_table`` uses when no separator is given.
              NOTE(review): this script splits rows on a comma elsewhere, so
              comma-delimited input probably needs ``sep=","`` — confirm.

      Returns:
          The parsed ``pandas.DataFrame``.
      """
      text = data.replace("\ufeff", "")
      frame = pd.read_table(io.StringIO(text), sep=sep)
      # Show the first rows as a quick sanity check of the parse.
      print(frame.head())
      return frame
  48.  
  49.  
  if __name__ == "__main__":
      # Pipeline: raw byte rows -> one decoded text blob -> DataFrame -> CSV.
      raw_rows = read_data(file_name)
      text = encode_in_unicode(raw_rows, delimiter)
      # NOTE(review): to_df is called without a separator, so the text is
      # parsed with pd.read_table's default (tab) while `delimiter` here is a
      # comma — confirm the frame is split into the intended columns.
      frame = to_df(text)
      frame.to_csv(output_file, index=False, encoding="utf-8")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement