Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import org.apache.commons.io.IOUtils;
- import org.archive.io.ArchiveRecord;
- import org.archive.io.warc.*;
- import org.archive.wayback.resourcestore.resourcefile.WarcResource;
- import java.io.BufferedOutputStream;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.OutputStream;
- import java.util.Iterator;
- import java.util.concurrent.atomic.AtomicInteger;
- /**
-  * Scans {@code in.warc.gz} for "response" records whose content contains the
-  * text "hello world" and is meant to copy those records into {@code out.warc.gz}.
-  *
-  * NOTE(review): this is still an unfinished sketch. Two gaps from the original
-  * remain and are marked with TODO below: the writer settings are a placeholder,
-  * and the {@code WARCRecordInfo} passed to the writer is never built.
-  */
- public class Test126b {
-     /**
-      * Reads every record of the input archive, prints the URL of each
-      * "response" record and hands the matching ones to the writer.
-      *
-      * @param args ignored; declared as varargs so that existing no-argument
-      *             calls keep compiling while the JVM can also launch the class
-      * @throws Exception if reading the input or writing the output fails
-      */
-     public static void main(String... args) throws Exception {
-         File out = new java.io.File("out.warc.gz");
-         OutputStream bos = new BufferedOutputStream(new FileOutputStream(out));
-         WARCWriterPoolSettings settings = ... // TODO: placeholder in the original; real settings still have to be supplied
-         WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, out, settings);
-         try {
-             File in = new java.io.File("in.warc.gz");
-             WARCReader reader = WARCReaderFactory.get(in);
-             try {
-                 Iterator<ArchiveRecord> it = reader.iterator();
-                 while (it.hasNext()) {
-                     ArchiveRecord archiveRecord = it.next();
-                     // Compare by value: '==' tests reference identity and is not
-                     // reliably true for a header value read from a file. Putting
-                     // the literal first also tolerates a missing WARC-Type header.
-                     if (!"response".equals(archiveRecord.getHeader().getHeaderValue("WARC-Type"))) {
-                         continue;
-                     }
-                     WARCRecord warcRecord = (WARCRecord) archiveRecord;
-                     WarcResource warcResource = new WarcResource(warcRecord, reader);
-                     warcResource.parseHeaders();
-                     String url = warcResource.getWarcHeaders().getUrl();
-                     System.out.println("+++ url: " + url);
-                     byte[] content = IOUtils.toByteArray(warcResource);
-                     // Decode with a fixed charset so the match does not depend on the
-                     // platform default. NOTE(review): assumes the payload is
-                     // ASCII-compatible; confirm against the archived content types.
-                     String htmlPage = new String(content, java.nio.charset.StandardCharsets.UTF_8);
-                     if (htmlPage.contains("hello world")) {
-                         // TODO: how to reconstruct the WARCRecordInfo -- 'warcRecordInfo'
-                         // is not defined anywhere yet (open question from the original).
-                         writer.writeRecord(warcRecordInfo);
-                     }
-                 }
-             } finally {
-                 // Close the reader even when a record fails to parse.
-                 reader.close();
-             }
-         } finally {
-             // Close the writer last, matching the original close order.
-             writer.close();
-         }
-     }
- }
Add Comment
Please sign in to add a comment.