Advertisement
Guest User

Untitled

a guest
Jul 16th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.68 KB | None | 0 0
  1. package dsa_assignment4;
  2.  
  import java.io.IOException;
  import java.io.PrintWriter;
  import java.nio.file.Files;
  import java.nio.file.Path;
  import java.nio.file.Paths;
  import java.util.ArrayDeque;
  import java.util.Arrays;
  import java.util.Deque;
  import java.util.HashSet;
  import java.util.LinkedList;
  import java.util.PriorityQueue;
  import java.util.Scanner;
  import java.util.Set;

  import org.apache.log4j.Logger;

  import dsa_assignment4.CsvFormatter.RowComparator;
  15.  
  16. /**
  17. * A class containing only static methods to externally sort simplified CSV
  18. * files
  19. */
  20. public class CsvUtils
  21. {
  22. private static final Logger logger = Logger.getLogger(CsvUtils.class);
  23.  
  24. // Your "data" directory will be at the top level of your Eclipse
  25. // project directory for this assignment: do not change the name
  26. // or put it anywhere else: the marking software will cause your
  27. // program to fail if it tries to read or write any files outside
  28. // this directory
  29. private static final Path dataDir = Paths.get("data");
  30.  
  31. /**
  32. * For marking purposes
  33. *
  34. * @return Your student id
  35. */
  36. public static String getStudentID()
  37. {
  38. //change this return value to return your student id number, e.g.
  39. // return "1234567";
  40. return "1884734";
  41. }
  42.  
  43. /**
  44. * For marking purposes
  45. *
  46. * @return Your name
  47. */
  48. public static String getStudentName()
  49. {
  50. //change this return value to return your name, e.g.
  51. // return "John Smith";
  52. return "JACOB WILSON";
  53. }
  54.  
  55. /**
  56. * An accessor method to return the path of your data directory
  57. *
  58. * @return the path to your data directory
  59. */
  60. public static Path getDataDir()
  61. {
  62. return dataDir;
  63. }
  64.  
  65. /**
  66. * A sample method to show the basic mechanism for reading and writing CSV
  67. * files using the CsvFormatter class. This just copies the input file to
  68. * the output file with no changes. However it has to make sure that the
  69. * output file is created with the correct CSV header.
  70. *
  71. * @param fromPath
  72. * The path of the CSV file to read from
  73. * @param toPath
  74. * The path of the CSV file to write to
  75. * @return true if it manages to complete without throwing exceptions (if
  76. * this were an empty method that you had to implement as part of
  77. * this assignment, you should leave the return value as false until
  78. * you had completed it to avoid unnecessary testing of an
  79. * unimplemented method
  80. * @throws Exception
  81. * if anything goes wrong, e.g. if you can't open either file,
  82. * can't read from the fromPath file, can't write to the toPath
  83. * file, if the from file does not match the requirements of the
  84. * simplified CSV file format, etc.
  85. */
  86. public static boolean copyCsv(Path fromPath, Path toPath)
  87. throws Exception
  88. {
  89. // Open both the from and the to files using a "try-with-resource" pattern
  90. // This ensures that, no matter what happens in terms of returns or exceptions,
  91. // both files will be correctly closed automatically
  92. try (Scanner from = new Scanner(fromPath); PrintWriter to = new PrintWriter(toPath.toFile()))
  93. {
  94. // Setup the CSV format from the "from" file
  95. CsvFormatter formatter = new CsvFormatter(from);
  96.  
  97. // Output the CSV header row to the "to" file
  98. formatter.writeHeader(to);
  99.  
  100. // copy each non-header row from the "from" file to the "to" file
  101. String[] row;
  102. while ((row = formatter.readRow(from)) != null)
  103. formatter.writeRow(to, row);
  104. }
  105. return true;
  106. }
  107.  
  108. /**
  109. * Split an (unordered) CSV file into separate smaller CSV files (runs)
  110. * containing sorted runs of row, where the rows are sorted in ascending
  111. * order of the column identified by the <code>columnName</code> parameter.
  112. * This is intended to be the first stage of a merge sort which produces
  113. * sorted runs that can then be merged together.
  114. * <p>
  115. * This code should work on truly huge files: far larger than we can hold in
  116. * memory at the same time. To simulate this without using huge files, we
  117. * impose a limit on the size of each run, given by the
  118. * <code>numRowLimit</code> parameter. Further, NO internal sort algorithms
  119. * should be used: e.g. Arrays.sort, Collections.sort, SortedList etc.
  120. * Instead a {@link PriorityQueue} must be used to generate the sorted runs:
  121. * Have a loop. Inside the loop, read in a maximum of
  122. * <code>numRowLimit</code> rows from the input and insert them into the
  123. * priority queue and then extract them in order and write them out to a new
  124. * split file.
  125. * </p>
  126. * <p>
  127. * The split file should be a sibling (i.e. in the same directory) as the
  128. * input file and have a name which is "temp_00000_" followed by the name of
  129. * the input file, where the "00000" is replace by a sequence number:
  130. * "00000" for the first split file, "00001" for the second etc.
  131. * </p>
  132. *
  133. * @param fromPath
  134. * The relative path where the input file is
  135. * @param columnName
  136. * The header name of the column used for sorting
  137. * @param numRowLimit
  138. * The maximum number of value rows (not including the header
  139. * row) that can be written into each split file
  140. * @return the <code>Path[]</code> of paths for the full list of split files created
  141. * @throws Exception
  142. * If anything goes wrong with opening, reading or writing the
  143. * files, or if the input file does not match the simplified CSV
  144. * requirements.
  145. */
  146. public static Path[] splitSortCsv(Path fromPath, String columnName, int numRowLimit)
  147. throws Exception
  148. {
  149. Deque<Path> pathDeque = new LinkedList<>();
  150.  
  151. // WRITE YOUR CODE HERE AND REPLACE THE RETURN STATEMENT
  152. try (Scanner from = new Scanner(fromPath))
  153. {
  154. CsvFormatter formatter = new CsvFormatter(from);
  155. CsvFormatter.RowComparator comparator = formatter.new RowComparator(columnName);
  156. int tempNum = 0;
  157.  
  158. while (from.hasNextLine())
  159. {
  160. tempNum++;
  161. PriorityQueue<String[]> queue = new PriorityQueue<>(comparator);
  162.  
  163. for (int i = 0; i < numRowLimit; i++)
  164. {
  165. if (from.hasNextLine())
  166. {
  167. queue.add(formatter.readRow(from));
  168. }
  169. else
  170. {
  171. break;
  172. }
  173. }
  174.  
  175. Path toPath = fromPath.resolveSibling(String.format("temp_%05d_%s", tempNum, fromPath.getFileName()));
  176.  
  177. try (PrintWriter to = new PrintWriter(toPath.toFile()))
  178. {
  179. formatter.writeHeader(to);
  180. while (!queue.isEmpty())
  181. {
  182. formatter.writeRow(to, queue.remove());
  183. }
  184. }
  185.  
  186. pathDeque.add(toPath);
  187. }
  188. }
  189.  
  190. return pathDeque.toArray(new Path[0]);
  191. }
  192.  
  193. /**
  194. * Merge two ordered input CSV files into a single ordered output CSV file
  195. *
  196. * The two input CSV files must be already ordered on the column specified
  197. * by <code>columnName</code> and must have the same CSV format (same number
  198. * of columns, same headers in the same order) The output file must
  199. * similarly be of the same CSV format and ordered on the same column.
  200. *
  201. * @param file1Path
  202. * The relative path of the first input file
  203. * @param file2Path
  204. * The relative path of the second input file
  205. * @param columnName
  206. * The column to order the output file on and, upon which, both
  207. * input files are ordered
  208. * @param outputPath
  209. * The relative path of the output file
  210. * @return true, if this method has been implemented. If it has not yet been
  211. * implemented, then it returns false and this is used to cause the
  212. * unit test to fail early without doing a lot of unnecessary work
  213. * @throws Exception
  214. * If anything goes wrong with opening, reading or writing the
  215. * files, or if the input files do not match the simplified CSV
  216. * requirements or have different CSV formats
  217. */
  218. public static boolean mergePairCsv(Path file1Path, Path file2Path, String columnName, Path outputPath)
  219. throws Exception
  220. {
  221. // WRITE YOUR CODE HERE AND REPLACE THE RETURN STATEMENT
  222. try (Scanner from1 = new Scanner(file1Path); Scanner from2 = new Scanner(file2Path); PrintWriter to = new PrintWriter(outputPath.toFile()))
  223. {
  224. CsvFormatter formatter1 = new CsvFormatter(from1);
  225. CsvFormatter.RowComparator comparator = formatter1.new RowComparator(columnName);
  226. CsvFormatter formatter2 = new CsvFormatter(from2);
  227.  
  228. if (Arrays.equals(formatter1.getHeaderStrings(), formatter2.getHeaderStrings()))
  229. {
  230. formatter1.writeHeader(to);
  231.  
  232. String[] nextLineFile1 = formatter1.readRow(from1);
  233. String[] nextLineFile2 = formatter2.readRow(from2);
  234.  
  235. while (nextLineFile1 != null && nextLineFile2 != null)
  236. {
  237. if (comparator.compare(nextLineFile1, nextLineFile2) < 0)
  238. {
  239. formatter1.writeRow(to, nextLineFile1);
  240. nextLineFile1 = formatter1.readRow(from1);
  241. }
  242. else if (comparator.compare(nextLineFile1, nextLineFile2) > 0)
  243. {
  244. formatter1.writeRow(to, nextLineFile2);
  245. nextLineFile2 = formatter2.readRow(from2);
  246. }
  247. else
  248. {
  249. formatter1.writeRow(to, nextLineFile1);
  250. formatter1.writeRow(to, nextLineFile2);
  251.  
  252. nextLineFile1 = formatter1.readRow(from1);
  253. nextLineFile2 = formatter2.readRow(from2);
  254. }
  255. }
  256.  
  257. while (nextLineFile1 != null)
  258. {
  259. formatter1.writeRow(to, nextLineFile1);
  260. nextLineFile1 = formatter1.readRow(from1);
  261. }
  262.  
  263. while (nextLineFile2 != null)
  264. {
  265. formatter1.writeRow(to, nextLineFile2);
  266. nextLineFile2 = formatter2.readRow(from2);
  267. }
  268. }
  269. else
  270. {
  271. throw new Exception("The files do not have matching headers");
  272. }
  273. }
  274.  
  275. return true;
  276. }
  277.  
  278. /**
  279. * Merge a list of ordered input CSV files into a single ordered output CSV
  280. * file
  281. * <p>
  282. * The input CSV files must be already ordered on the column specified by
  283. * <code>columnName</code> and must have the same CSV format (same number of
  284. * columns, same headers in the same order) The output file must similarly
  285. * be of the same CSV format and ordered on the same column.
  286. * </p>
  287. * <p>
  288. * This method should merge all the files together by calling
  289. * <code>mergePairCsv(...)</code> on pairs of files, starting with those on
  290. * <code>pathList</code>, producing larger and larger intermediate file
  291. * until the last pair-wise merge is used to produce the output file.
  292. * </p>
  293. *
  294. * @param pathList
  295. * An array of relative paths of the input files
  296. * @param columnName
  297. * The column to order the output file on and, upon which, both
  298. * input files are ordered
  299. * @param outputPath
  300. * The relative path of the output file
  301. * @return true, if this method has been implemented. If it has not yet been
  302. * implemented, then it returns false and this is used to cause the
  303. * unit test to fail early without doing a lot of unnecessary work
  304. * @throws Exception
  305. * If anything goes wrong with opening, reading or writing the
  306. * files, or if the input files do not match the simplified CSV
  307. * requirements or have different CSV formats
  308. */
  309. public static boolean mergeListCsv(Path[] pathList, String columnName, Path outputPath)
  310. throws Exception
  311. {
  312. Deque<Path> paths = new LinkedList<>(Arrays.asList(pathList));
  313.  
  314. // WRITE YOUR CODE HERE AND REPLACE THE RETURN STATEMENT
  315. int tempNumber = 0;
  316.  
  317. while (paths.size() > 1)
  318. {
  319. Path toMerge1 = paths.remove();
  320. Path toMerge2 = paths.remove();
  321. Path output;
  322.  
  323. if (paths.isEmpty())
  324. {
  325. output = outputPath;
  326. }
  327. else
  328. {
  329. output = outputPath.resolveSibling(String.format("temp_%05d_%s", tempNumber, outputPath.getFileName()));
  330. }
  331.  
  332. mergePairCsv(toMerge1, toMerge2, columnName, output);
  333. paths.add(output);
  334.  
  335. tempNumber++;
  336. }
  337.  
  338. return true;
  339.  
  340. }
  341. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement