Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package clustering.preProcessing;
- import clustering.FileDocument;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.io.FileReader;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- * Created by IntelliJ IDEA.
- * User: ian.macgillivray
- * Date: Nov 4, 2010
- * Time: 12:09:15 PM
- * To change this template use File | Settings | File Templates.
- */
- public class RNSArchiveFirstProcessing
- {
- private static void splitBodiesIntoFileSystem(String bodiesFile)
- {
- int count = 0;
- String outputDir = "C:\\Users\\Administrator\\Documents\\rna_2006_2009\\Archive\\";
- String countDir = "0\\";
- // Have to use our own reader to avoid memory problems - giant file
- BufferedReader br = null;
- try {
- br = new BufferedReader(new FileReader(bodiesFile));
- String line = br.readLine();
- while(line != null)
- {
- if(count > 106170){
- if(count % 10000 == 0){
- countDir = count+"\\";
- File file = new File(outputDir + countDir); file.mkdir();
- System.out.println("Processed " + count + " files.");
- }
- FileDocument.outputFile(outputDir + countDir + line.split("\\s+")[0].replaceAll("[\\\\\\/\\:\\*\\?\\\"\\<\\>]", "") + ".txt", addNewlines(removeHeaders(line)));
- }
- line = br.readLine();
- count++; System.out.println(count);
- }
- }
- catch(Exception e)
- {
- System.out.println(e);
- }
- }
- private static String removeHeaders(String document)
- {
- return document.replaceAll("^.*\\s+.*?Reuters\\)\\s\\-\\s", "");
- }
- private static String addNewlines(String document)
- {
- return document.replaceAll(" ", "\n\n");
- }
- public static void main(String[] args)
- {
- splitBodiesIntoFileSystem("C:\\Users\\Administrator\\Documents\\rna_2006_2009\\Silver235_2006_2009_bodies.txt");
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement