Advertisement
Guest User

Untitled

a guest
Jun 25th, 2017
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.06 KB | None | 0 0
  1. package clustering.preProcessing;
  2.  
  3. import clustering.FileDocument;
  4.  
  5. import java.io.BufferedReader;
  6. import java.io.File;
  7. import java.io.FileNotFoundException;
  8. import java.io.FileReader;
  9. import java.util.regex.Matcher;
  10. import java.util.regex.Pattern;
  11.  
  12. /**
  13. * Created by IntelliJ IDEA.
  14. * User: ian.macgillivray
  15. * Date: Nov 4, 2010
  16. * Time: 12:09:15 PM
  17. * To change this template use File | Settings | File Templates.
  18. */
  19. public class RNSArchiveFirstProcessing
  20. {
  21.  
  22. private static void splitBodiesIntoFileSystem(String bodiesFile)
  23. {
  24. int count = 0;
  25. String outputDir = "C:\\Users\\Administrator\\Documents\\rna_2006_2009\\Archive\\";
  26. String countDir = "0\\";
  27.  
  28. // Have to use our own reader to avoid memory problems - giant file
  29. BufferedReader br = null;
  30. try {
  31. br = new BufferedReader(new FileReader(bodiesFile));
  32.  
  33. String line = br.readLine();
  34. while(line != null)
  35. {
  36. if(count > 106170){
  37. if(count % 10000 == 0){
  38. countDir = count+"\\";
  39. File file = new File(outputDir + countDir); file.mkdir();
  40. System.out.println("Processed " + count + " files.");
  41. }
  42. FileDocument.outputFile(outputDir + countDir + line.split("\\s+")[0].replaceAll("[\\\\\\/\\:\\*\\?\\\"\\<\\>]", "") + ".txt", addNewlines(removeHeaders(line)));
  43. }
  44.  
  45. line = br.readLine();
  46. count++; System.out.println(count);
  47. }
  48. }
  49. catch(Exception e)
  50. {
  51. System.out.println(e);
  52. }
  53. }
  54.  
  55. private static String removeHeaders(String document)
  56. {
  57. return document.replaceAll("^.*\\s+.*?Reuters\\)\\s\\-\\s", "");
  58. }
  59.  
  60. private static String addNewlines(String document)
  61. {
  62. return document.replaceAll(" ", "\n\n");
  63. }
  64.  
  65. public static void main(String[] args)
  66. {
  67. splitBodiesIntoFileSystem("C:\\Users\\Administrator\\Documents\\rna_2006_2009\\Silver235_2006_2009_bodies.txt");
  68. }
  69. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement