package com.datadreamer.whitehouse.petition;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Reads a saved petition HTML page and appends one tab-separated line per
 * signature entry to {@code data/signatures.txt}.
 *
 * <p>Each output line has the form
 * {@code signatureNumber \t name \t place \t date}. Fields that could not be
 * extracted are written as empty strings.
 */
public class PetitionParser {

    /** Text that separates the label from the number in a signature-number line. */
    private static final String SIGNATURE_MARKER = "# ";

    /**
     * Loads the HTML file at {@code filename} and hands its contents to the
     * signature parser.
     *
     * <p>Failures are reported by printing the stack trace; no exception is
     * propagated to the caller.
     *
     * @param filename path of the HTML file to read
     */
    public void loadAndParse(String filename){
        try{
            // grab html from file (decoded with the platform default charset)
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            StringBuilder buffer = new StringBuilder();
            try{
                String strLine;
                // NOTE: readLine() strips line terminators, so the lines are
                // concatenated without any separator before being parsed.
                while((strLine = br.readLine()) != null){
                    buffer.append(strLine);
                }
            } finally {
                // Release the file handle even when reading fails part-way.
                br.close();
            }
            // parse html for signature data
            parse(buffer.toString());
        } catch(Exception e){
            e.printStackTrace();
        }
    }

    /**
     * Extracts every signature entry from {@code html} and appends it to
     * {@code data/signatures.txt}, then prints how many entries had a
     * recognised details layout.
     *
     * <p>An entry is every {@code div} whose {@code class} attribute starts with
     * {@code entry}. Its details are split on newlines: two lines are read as
     * date and signature number, three lines as place, date and signature
     * number. Entries with any other number of detail lines are still written,
     * with empty place, date and number, but are not counted.
     *
     * @param html the full HTML text of the petition page
     */
    private void parse(String html){
        try{
            Document doc = Jsoup.parse(html);
            Elements signatureDivs = doc.select("div[class^=entry]");
            int sigCount = 0;
            // Opened in append mode so repeated runs accumulate in the same file.
            BufferedWriter dataFile = new BufferedWriter(new FileWriter(new File("data/signatures.txt"), true));
            try{
                // iterate over every signature div
                for(Element sigDiv : signatureDivs){
                    String name = sigDiv.getElementsByClass("name").html().trim();
                    String details = sigDiv.getElementsByClass("details").html();
                    String[] detailLines = details.split("\n");
                    String place = "";
                    String date = "";
                    String sigNum = "";
                    // grab the content from the signature divs
                    if(detailLines.length == 2){
                        date = detailLines[0].replace("<br />", "").trim();
                        sigNum = extractSignatureNumber(detailLines[1]);
                        sigCount++;
                    } else if (detailLines.length == 3){
                        place = detailLines[0];
                        date = detailLines[1].replace("<br />", "").trim();
                        sigNum = extractSignatureNumber(detailLines[2]);
                        sigCount++;
                    }
                    // format the data for each line using tab separated values
                    String signatureData = sigNum +"\t"+ name +"\t"+ place +"\t"+ date;
                    dataFile.write(signatureData);
                    dataFile.newLine();
                }
            } finally {
                // Closing here flushes lines already buffered and releases the
                // file even if an entry fails while the loop is running.
                dataFile.close();
            }
            System.out.println(sigCount +" signatures");
        } catch(Exception e){
            e.printStackTrace();
        }
    }

    /**
     * Returns the trimmed text that follows the first {@code "# "} marker in
     * {@code line}, or an empty string when the marker is absent or nothing
     * follows it.
     *
     * <p>Checking the array length avoids the
     * {@code ArrayIndexOutOfBoundsException} that an unconditional
     * {@code split("# ")[1]} raises on such lines, which would otherwise abort
     * the whole run because of a single malformed entry.
     *
     * @param line the details line expected to carry the signature number
     * @return the signature number text, or {@code ""} if none was found
     */
    private static String extractSignatureNumber(String line){
        String[] parts = line.split(SIGNATURE_MARKER);
        return parts.length > 1 ? parts[1].trim() : "";
    }

    /**
     * Runs the parser against the bundled sample file {@code data/sample.html}.
     *
     * @param args command-line arguments (unused)
     */
    static public void main(String[] args){
        PetitionParser pp = new PetitionParser();
        pp.loadAndParse("data/sample.html");
    }
}