Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package com.datadreamer.whitehouse.petition;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.DataInputStream;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileWriter;
- import java.io.InputStreamReader;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class PetitionParser {
- public void loadAndParse(String filename){
- try{
- // grab html from file
- FileInputStream fstream = new FileInputStream(filename);
- DataInputStream in = new DataInputStream(fstream);
- BufferedReader br = new BufferedReader(new InputStreamReader(in));
- String strLine;
- StringBuffer buffer = new StringBuffer();
- while((strLine = br.readLine()) != null){
- buffer.append(strLine);
- }
- in.close();
- // parse html for signature data
- parse(buffer.toString());
- } catch(Exception e){
- e.printStackTrace();
- }
- }
- private void parse(String html){
- try{
- Document doc = Jsoup.parse(html);
- Elements signatureDivs = doc.select("div[class^=entry]");
- int sigCount = 0;
- BufferedWriter dataFile = new BufferedWriter(new FileWriter(new File("data/signatures.txt"), true));
- // iterate over every signature div
- for(Element sigDiv : signatureDivs){
- String name = sigDiv.getElementsByClass("name").html().trim();
- String details = sigDiv.getElementsByClass("details").html();
- String[] detailLines = details.split("\n");
- String place = "";
- String date = "";
- String sigNum = "";
- // grab the content from the signature divs
- if(detailLines.length == 2){
- date = detailLines[0].replace("<br />", "").trim();
- sigNum = detailLines[1].split("# ")[1].trim();
- sigCount++;
- } else if (detailLines.length == 3){
- place = detailLines[0];
- date = detailLines[1].replace("<br />", "").trim();
- sigNum = detailLines[2].split("# ")[1].trim();
- sigCount++;
- }
- // format the data for each line using tab seperated values
- String signatureData = sigNum +"\t"+ name +"\t"+ place +"\t"+ date;
- //System.out.println(signatureData);
- dataFile.write(signatureData);
- dataFile.newLine();
- }
- dataFile.close();
- System.out.println(sigCount +" signatures");
- } catch(Exception e){
- e.printStackTrace();
- }
- }
- static public void main(String[] args){
- PetitionParser pp = new PetitionParser();
- pp.loadAndParse("data/sample.html");
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement