Guest User

Untitled

a guest
Jun 22nd, 2018
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.70 KB | None | 0 0
  1. import org.apache.commons.io.IOUtils;
  2. import org.archive.io.ArchiveRecord;
  3. import org.archive.io.warc.*;
  4. import org.archive.wayback.resourcestore.resourcefile.WarcResource;
  5.  
  6. import java.io.BufferedOutputStream;
  7. import java.io.File;
  8. import java.io.FileOutputStream;
  9. import java.io.OutputStream;
  10. import java.util.Iterator;
  11. import java.util.concurrent.atomic.AtomicInteger;
  12.  
  13. public class Test126b {
  14. public static void main() throws Exception {
  15. File out = new java.io.File("out.warc.gz");
  16. OutputStream bos = new BufferedOutputStream(new FileOutputStream(out));
  17. WARCWriterPoolSettings settings = ...
  18. WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, out, settings);
  19.  
  20. File in = new java.io.File("in.warc.gz");
  21. WARCReader reader = WARCReaderFactory.get(in);
  22. Iterator<ArchiveRecord> it = reader.iterator();
  23.  
  24. while (it.hasNext()) {
  25. ArchiveRecord archiveRecord = it.next();
  26. if (archiveRecord.getHeader().getHeaderValue("WARC-Type") == "response") {
  27. WARCRecord warcRecord = (WARCRecord) archiveRecord;
  28. WarcResource warcResource = new WarcResource(warcRecord, reader);
  29. warcResource.parseHeaders();
  30. String url = warcResource.getWarcHeaders().getUrl();
  31. System.out.println("+++ url: " + url);
  32. byte[] content = IOUtils.toByteArray(warcResource);
  33.  
  34. String htmlPage = new String(content);
  35. if (htmlPage.contains("hello world")) {
  36. writer.writeRecord(warcRecordInfo) // how to reconstruct the WARCRecordInfo
  37. }
  38. }
  39. }
  40.  
  41. reader.close();
  42. writer.close();
  43. }
  44. }
Add Comment
Please, Sign In to add comment