Advertisement
Guest User

Untitled

a guest
Apr 3rd, 2010
467
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 4.90 KB | None | 0 0
  1. // filename: ExternalSort.java
  2. import java.util.*;
  3. import java.io.*;
  4.  
  5. /**
  6. * Goal: offer a generic external-memory sorting program in Java.
  7. *
  8. * It must be :
  9. *  - hackable (easy to adapt)
  10. *  - scalable to large files
  11. *  - sensibly efficient.
  12. *
  13. * This software is in the public domain.
  14. *
  15. * By Daniel Lemire, April 2010
  16. * http://www.daniel-lemire.com/
  17. */
  18. public class ExternalSort {
  19.  
  20.     /**
  21.      * This will simply load the file by blocks of x rows, then
  22.      * sort them in-memory, and write the result to a bunch of
  23.      * temporary files that have to be merged later.
  24.      *
  25.      * @param file some flat  file
  26.      * @return a list of temporary flat files
  27.      */
  28.     public static List<File> sortInBatch(File file, Comparator<String> cmp) throws IOException {
  29.         List<File> files = new ArrayList<File>();
  30.         BufferedReader fbr = new BufferedReader(new FileReader(file));
  31.         long totalrowread = 0;
  32.         try{
  33.             List<String> tmplist =  new ArrayList<String>();
  34.             String line = "";
  35.             try {
  36.                 while(line != null) {
  37.                     // Gets rid of an infinite loop on overloaded machines  (P. Beaudoin)
  38.                                         if (Runtime.getRuntime().freeMemory() < 2097152 )
  39.                                                 throw new RuntimeException( "Can't free enough memory." );  
  40.                     while((Runtime.getRuntime().freeMemory() > 2097152)
  41.                     &&(   (line = fbr.readLine()) != null) ){ // as long as you have 2MB
  42.                         tmplist.add(line);
  43.                     }
  44.                     files.add(sortAndSave(tmplist,cmp));
  45.                     tmplist.clear();
  46.                                         // Makes sure the garbage collector is invoked.
  47.                                         // We hope to go back to 2MB. (P. Beaudoin)
  48.                                         Runtime.getRuntime().gc();
  49.                 }
  50.             } catch(EOFException oef) {
  51.                 if(tmplist.size()>0) {
  52.                     files.add(sortAndSave(tmplist,cmp));
  53.                     tmplist.clear();
  54.                 }
  55.             }
  56.         } finally {
  57.             fbr.close();
  58.         }
  59.         return files;
  60.     }
  61.  
  62.  
  63.     public static File sortAndSave(List<String> tmplist, Comparator<String> cmp) throws IOException  {
  64.         Collections.sort(tmplist,cmp);  //
  65.         File newtmpfile = File.createTempFile("sortInBatch", "flatfile");
  66.         newtmpfile.deleteOnExit();
  67.         BufferedWriter fbw = new BufferedWriter(new FileWriter(newtmpfile));
  68.         try {
  69.             for(String r : tmplist) {
  70.                 fbw.write(r);
  71.                 fbw.newLine();
  72.             }
  73.         } finally {
  74.             fbw.close();
  75.         }
  76.         return newtmpfile;
  77.     }
  78.     /**
  79.      * This merges a bunch of temporary flat files
  80.      * @param files
  81.      * @param output file
  82.          * @return The number of lines sorted. (P. Beaudoin)
  83.      */
  84.     public static int mergeSortedFiles(List<File> files, File outputfile, Comparator<String> cmp) throws IOException {
  85.         PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>();
  86.         for (File f : files) {
  87.             BinaryFileBuffer bfb = new BinaryFileBuffer(f,cmp);
  88.             pq.add(bfb);
  89.         }
  90.         BufferedWriter fbw = new BufferedWriter(new FileWriter(outputfile));
  91.         int rowcounter = 0;
  92.         try {
  93.             while(pq.size()>0) {
  94.                 BinaryFileBuffer bfb = pq.poll();
  95.                 String r = bfb.pop();
  96.                 fbw.write(r);
  97.                 fbw.newLine();
  98.                 ++rowcounter;
  99.                 if(bfb.empty()) {
  100.                     bfb.fbr.close();
  101.                     bfb.originalfile.delete();// we don't need you anymore
  102.                 } else {
  103.                     pq.add(bfb); // add it back
  104.                 }
  105.             }
  106.         } finally {
  107.             fbw.close();
  108.         }
  109.         return rowcounter;
  110.     }
  111.  
  112.     public static void main(String[] args) throws IOException {
  113.         if(args.length<2) {
  114.             System.out.println("please provide input and output file names");
  115.             return;
  116.         }
  117.         String inputfile = args[0];
  118.         String outputfile = args[1];
  119.         Comparator<String> comparator = new Comparator<String>() {
  120.             public int compare(String r1, String r2){
  121.                 return r1.compareTo(r2);}};
  122.         List<File> l = sortInBatch(new File(inputfile), comparator) ;
  123.         mergeSortedFiles(l, new File(outputfile), comparator);
  124.     }
  125. }
  126.  
  127.  
  128. class BinaryFileBuffer  implements Comparable<BinaryFileBuffer>{
  129.     public static int BUFFERSIZE = 512;
  130.     public BufferedReader fbr;
  131.     Comparator<String> mCMP;
  132.     public File originalfile;
  133.     private String cache;
  134.     private boolean empty;
  135.    
  136.     public BinaryFileBuffer(File f, Comparator<String> cmp) throws IOException {
  137.         originalfile = f;
  138.         mCMP = cmp;
  139.         fbr = new BufferedReader(new FileReader(f), 8192*BUFFERSIZE*2);
  140.         reload();
  141.     }
  142.    
  143.     public boolean empty() {
  144.         return empty;
  145.     }
  146.    
  147.     private void reload() throws IOException {
  148.         try {
  149.           if((this.cache = fbr.readLine()) == null){
  150.             empty = true;
  151.             cache = null;
  152.           }
  153.           else{
  154.             empty = false;
  155.           }
  156.       } catch(EOFException oef) {
  157.         empty = true;
  158.         cache = null;
  159.       }
  160.     }
  161.    
  162.    
  163.     public String peek() {
  164.         if(empty()) return null;
  165.         return cache.toString();
  166.     }
  167.     public String pop() throws IOException {
  168.       String answer = peek();
  169.         reload();
  170.       return answer;
  171.     }
  172.    
  173.     public int compareTo(BinaryFileBuffer b) {
  174.         return mCMP.compare(peek(), b.peek());
  175.     }
  176.    
  177.  
  178. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement