Advertisement
Guest User

Philippe Beaudoin

a guest
Apr 1st, 2010
1,007
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 5.24 KB | None | 0 0
  1. // filename: ExternalSort.java
  2. import java.util.*;
  3. import java.io.*;
  4.  
  5. /**
  6. * Goal: offer a generic external-memory sorting program in Java.
  7. *
  8. * It must be :
  9. *  - hackable (easy to adapt)
  10. *  - scalable to large files
  11. *  - sensibly efficient.
  12. *
  13. * This software is in the public domain.
  14. *
  15. * By Daniel Lemire, April 2010
  16. * http://www.daniel-lemire.com/
  17. */
  18. public class ExternalSort {
  19.  
  20.     /**
  21.      * This will simply load the file by blocks of x rows, then
  22.      * sort them in-memory, and write the result to a bunch of
  23.      * temporary files that have to be merged later.
  24.      *
  25.      * @param file some flat  file
  26.      * @return a list of temporary flat files
  27.      */
  28.     public static List<File> sortInBatch(File file, Comparator<String> cmp) throws IOException {
  29.         List<File> files = new Vector<File>();
  30.         BufferedReader fbr = new BufferedReader(new FileReader(file));
  31.         long totalrowread = 0;
  32.         try{
  33.             List<String> tmplist =  new Vector<String>();
  34.             String line = "";
  35.             try {
  36.                 while(line != null) {
  37.                     // Gets rid of an infinite loop on overloaded machines  (P. Beaudoin)
  38.                                         if (Runtime.getRuntime().freeMemory() < 2097152 )
  39.                                                 throw new RuntimeException( "Can't free enough memory." );  
  40.                     //tmplist = new Vector<String>(); // You already create this outside the loop,
  41.                                                                           // Then you write it out to a file. No need
  42.                                                                           // to construct it every time. (P. Beaudoin)
  43.                     while((Runtime.getRuntime().freeMemory() > 2097152)
  44.                     &&(   (line = fbr.readLine()) != null) ){ // as long as you have 2MB
  45.                         tmplist.add(line);
  46.                     }
  47.                     files.add(sortAndSave(tmplist,cmp));
  48.                     tmplist.clear();
  49.                                         // Makes sure the garbage collector is invoked.
  50.                                         // We hope to go back to 2MB. (P. Beaudoin)
  51.                                         Runtime.getRuntime().gc();
  52.                 }
  53.             } catch(EOFException oef) {
  54.                 if(tmplist.size()>0) {
  55.                     files.add(sortAndSave(tmplist,cmp));
  56.                     tmplist.clear();
  57.                 }
  58.             }
  59.         } finally {
  60.             fbr.close();
  61.         }
  62.         return files;
  63.     }
  64.  
  65.  
  66.     public static File sortAndSave(List<String> tmplist, Comparator<String> cmp) throws IOException  {
  67.         Collections.sort(tmplist,cmp);  //
  68.         File newtmpfile = File.createTempFile("sortInBatch", "flatfile");
  69.         newtmpfile.deleteOnExit();
  70.         BufferedWriter fbw = new BufferedWriter(new FileWriter(newtmpfile));
  71.         try {
  72.             for(String r : tmplist) {
  73.                 fbw.write(r);
  74.                 fbw.newLine();
  75.             }
  76.         } finally {
  77.             fbw.close();
  78.         }
  79.         return newtmpfile;
  80.     }
  81.     /**
  82.      * This merges a bunch of temporary flat files
  83.      * @param files
  84.      * @param output file
  85.          * @return The number of lines sorted. (P. Beaudoin)
  86.      */
  87.     public static int mergeSortedFiles(List<File> files, File outputfile, Comparator<String> cmp) throws IOException {
  88.         PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>();
  89.         for (File f : files) {
  90.             BinaryFileBuffer bfb = new BinaryFileBuffer(f,cmp);
  91.             pq.add(bfb);
  92.         }
  93.         BufferedWriter fbw = new BufferedWriter(new FileWriter(outputfile));
  94.         int rowcounter = 0;
  95.         try {
  96.             while(pq.size()>0) {
  97.                 BinaryFileBuffer bfb = pq.poll();
  98.                 String r = bfb.pop();
  99.                 fbw.write(r);
  100.                 fbw.newLine();
  101.                 ++rowcounter;
  102.                 if(bfb.empty()) {
  103.                     bfb.fbr.close();
  104.                     bfb.originalfile.delete();// we don't need you anymore
  105.                 } else {
  106.                     pq.add(bfb); // add it back
  107.                 }
  108.             }
  109.         } finally {
  110.             fbw.close();
  111.         }
  112.         return rowcounter;
  113.     }
  114.  
  115.     public static void main(String[] args) throws IOException {
  116.         if(args.length<2) {
  117.             System.out.println("please provide input and output file names");
  118.             return;
  119.         }
  120.         String inputfile = args[0];
  121.         String outputfile = args[1];
  122.         Comparator<String> comparator = new Comparator<String>() {
  123.             public int compare(String r1, String r2){
  124.                 return r1.compareTo(r2);}};
  125.         List<File> l = sortInBatch(new File(inputfile), comparator) ;
  126.         mergeSortedFiles(l, new File(outputfile), comparator);
  127.     }
  128. }
  129.  
  130.  
  131. class BinaryFileBuffer  implements Comparable<BinaryFileBuffer>{
  132.     public static int BUFFERSIZE = 512;
  133.     public BufferedReader fbr;
  134.     private List<String> buf = new Vector<String>();
  135.     int currentpointer = 0;
  136.     Comparator<String> mCMP;
  137.     public File originalfile;
  138.    
  139.     public BinaryFileBuffer(File f, Comparator<String> cmp) throws IOException {
  140.         originalfile = f;
  141.         mCMP = cmp;
  142.         fbr = new BufferedReader(new FileReader(f));
  143.         reload();
  144.     }
  145.    
  146.     public boolean empty() {
  147.         return buf.size()==0;
  148.     }
  149.    
  150.     private void reload() throws IOException {
  151.           buf.clear();
  152.           try {
  153.               String line;
  154.               while((buf.size()<BUFFERSIZE) && ((line = fbr.readLine()) != null))
  155.                 buf.add(line);
  156.             } catch(EOFException oef) {
  157.             }      
  158.     }
  159.    
  160.    
  161.     public String peek() {
  162.         if(empty()) return null;
  163.         return buf.get(currentpointer);
  164.     }
  165.     public String pop() throws IOException {
  166.       String answer = peek();
  167.       ++currentpointer;
  168.       if(currentpointer == buf.size()) {
  169.           reload();
  170.           currentpointer = 0;
  171.       }
  172.       return answer;
  173.     }
  174.    
  175.     public int compareTo(BinaryFileBuffer b) {
  176.         return mCMP.compare(peek(), b.peek());
  177.     }
  178.    
  179.  
  180. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement