Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.IOException;
- import java.util.*;
- import javax.xml.parsers.ParserConfigurationException;
- import javax.xml.parsers.SAXParser;
- import javax.xml.parsers.SAXParserFactory;
- import org.xml.sax.Attributes;
- import org.xml.sax.SAXException;
- import org.xml.sax.helpers.DefaultHandler;
- import java.sql.*;
- public class ProjectParser extends DefaultHandler{
- List myDocs;
- private String tempVal;
- //to maintain context
- private Document tempDoc = new Document();
- private String currentDoc;
- static Connection conn = null;
- static PreparedStatement psInsertGenre = null;
- static String sqlInsertGenre = null;
- static PreparedStatement psInsertPeople = null;
- static String sqlInsertPeople = null;
- static PreparedStatement psInsertTitle = null;
- static String sqlInsertTitle = null;
- static PreparedStatement psInsertPublisher = null;
- static String sqlInsertPublisher = null;
- static PreparedStatement psInsertDocument = null;
- static String sqlInsertDocument = null;
- static Hashtable<String, String> hashtable = new Hashtable<String, String>();
- static int hashID = 0;
- // Document Variables
- static String title;
- static String startPage;
- static String endPage;
- static String year;
- static String volume;
- static String number;
- static String url;
- static String ee;
- static String cdrom;
- static String crossref;
- static String cite;
- static String isbn;
- static String series;
- static String editor;
- static String bookTitle;
- static String publisher;
- public ProjectParser() {
- myDocs = new ArrayList();
- }
- public void runExample() {
- parseDocument();
- }
- private void parseDocument() {
- //get a factory
- SAXParserFactory spf = SAXParserFactory.newInstance();
- try {
- //get a new instance of parser
- SAXParser sp = spf.newSAXParser();
- //parse the file and also register this class for call backs
- sp.parse("/Users/bshiaw/Desktop/bigData/dblp-data.xml", this);
- }catch(SAXException se) {
- se.printStackTrace();
- }catch(ParserConfigurationException pce) {
- pce.printStackTrace();
- }catch (IOException ie) {
- ie.printStackTrace();
- }
- }
- /**
- * Iterate through the list and print
- * the contents
- */
- private void printData(){
- System.out.println("Number of Documents '" + myDocs.size() + "'.");
- Iterator it = myDocs.iterator();
- while(it.hasNext()) {
- System.out.println(it.next().toString());
- }
- }
- public static void main(String[] args) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
- // Incorporate mySQL driver
- Class.forName("com.mysql.jdbc.Driver").newInstance();
- // Connect to the test database
- try {
- conn = DriverManager.getConnection("jdbc:mysql:///bookdb","root", "lakers");
- }catch (SQLException e) {
- e.printStackTrace();
- }
- int[] iNoRows = null;
- //For tbl_genres
- sqlInsertGenre = "insert into tbl_genres (genre_name) values(?)";
- try {
- psInsertGenre = conn.prepareStatement(sqlInsertGenre);
- }catch (SQLException e) {
- e.printStackTrace();
- }
- //For tbl_people
- sqlInsertPeople = "INSERT INTO tbl_people (name) values(?)";
- try {
- psInsertPeople = conn.prepareStatement(sqlInsertPeople);
- }catch (SQLException e) {
- e.printStackTrace();
- }
- //For tbl_booktitle
- sqlInsertTitle = "INSERT INTO tbl_booktitle (title) values(?)";
- try {
- psInsertTitle = conn.prepareStatement(sqlInsertTitle);
- }catch (SQLException e) {
- e.printStackTrace();
- }
- //For tbl_publisher
- sqlInsertPublisher = "INSERT INTO tbl_publisher(publisher_name) values(?)";
- try {
- psInsertPublisher = conn.prepareStatement(sqlInsertPublisher);
- }catch (SQLException e) {
- e.printStackTrace();
- }
- //For tbl_dblp_document
- sqlInsertDocument = "INSERT INTO tbl_dblp_document(title, start_page, end_page, year, volume, number, url, ee, cdrom, cite, crossref, isbn, series," +
- "editor_id, booktitle_id, publisher_id values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
- try {
- psInsertDocument = conn.prepareStatement(sqlInsertDocument);
- }catch (SQLException e) {
- e.printStackTrace();
- }
- ProjectParser pp = new ProjectParser();
- pp.runExample();
- System.out.println("Parsing Complete.\n");
- try {
- iNoRows = psInsertGenre.executeBatch();
- }catch (SQLException e) {
- e.printStackTrace();
- }
- try {
- iNoRows = psInsertPeople.executeBatch();
- }catch (SQLException e) {
- e.printStackTrace();
- }
- try {
- iNoRows = psInsertTitle.executeBatch();
- }catch (SQLException e) {
- e.printStackTrace();
- }
- try {
- iNoRows = psInsertPublisher.executeBatch();
- }catch (SQLException e) {
- e.printStackTrace();
- }
- try {
- iNoRows = psInsertDocument.executeBatch();
- }catch (SQLException e) {
- e.printStackTrace();
- }
- System.out.println("Execute Batch Complete.");
- try {
- if(psInsertGenre!=null) psInsertGenre.close();
- if(psInsertPeople!=null) psInsertPeople.close();
- if(psInsertTitle!=null) psInsertTitle.close();
- if(psInsertPublisher!=null) psInsertPublisher.close();
- if(psInsertDocument!=null) psInsertDocument.close();
- if(conn!=null) conn.close();
- }catch (SQLException e) {
- e.printStackTrace();
- }
- }
- //Event Handlers
- public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
- //reset
- tempVal = "";
- if(qName.equalsIgnoreCase("book")) {
- //create a new instance of employee
- // tempDoc = new Document();
- // tempDoc.setmdate(attributes.getValue("mdate"));
- // tempDoc.setKey(attributes.getValue("key"));
- // tempDoc.setDocType("Book");
- currentDoc = "Book";
- if (! hashtable.containsValue(currentDoc)) {
- hashtable.put(Integer.toString(hashID), currentDoc);
- hashID++;
- try {
- psInsertGenre.setString(1, currentDoc);
- psInsertGenre.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- else if (qName.equalsIgnoreCase("incollection")) {
- // tempDoc = new Document();
- // tempDoc.setmdate(attributes.getValue("mdate"));
- // tempDoc.setKey(attributes.getValue("key"));
- // tempDoc.setDocType("Incollection");
- currentDoc = "Incollection";
- if (! hashtable.containsValue(currentDoc)) {
- hashtable.put(Integer.toString(hashID), currentDoc);
- hashID++;
- try {
- psInsertGenre.setString(1, currentDoc);
- psInsertGenre.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- else if (qName.equalsIgnoreCase("proceedings")) {
- // tempDoc = new Document();
- // tempDoc.setmdate(attributes.getValue("mdate"));
- // tempDoc.setKey(attributes.getValue("key"));
- // tempDoc.setDocType("Proceedings");
- currentDoc = "Proceedings";
- if (! hashtable.containsValue(currentDoc)) {
- hashtable.put(Integer.toString(hashID), currentDoc);
- hashID++;
- try {
- psInsertGenre.setString(1, currentDoc);
- psInsertGenre.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- else if (qName.equalsIgnoreCase("inproceedings")) {
- // tempDoc = new Document();
- // tempDoc.setmdate(attributes.getValue("mdate"));
- // tempDoc.setKey(attributes.getValue("key"));
- // tempDoc.setDocType("Inproceedings");
- currentDoc = "Inproceedings";
- if (! hashtable.containsValue(currentDoc)) {
- hashtable.put(Integer.toString(hashID), currentDoc);
- hashID++;
- try {
- psInsertGenre.setString(1, currentDoc);
- psInsertGenre.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- }
- public void characters(char[] ch, int start, int length) throws SAXException {
- tempVal = new String(ch,start,length);
- }
- public void endElement(String uri, String localName, String qName) throws SAXException {
- if (qName.equalsIgnoreCase(currentDoc)){
- //add it to the list
- try{
- psInsertDocument.setString(1, title);
- psInsertDocument.setString(2, startPage);
- psInsertDocument.setString(3, endPage);
- psInsertDocument.setString(4, year);
- psInsertDocument.setString(5, volume);
- psInsertDocument.setString(6, number);
- psInsertDocument.setString(7, url);
- psInsertDocument.setString(8, ee);
- psInsertDocument.setString(9, cdrom);
- psInsertDocument.setString(10, cite);
- psInsertDocument.setString(11, crossref);
- psInsertDocument.setString(12, isbn);
- psInsertDocument.setString(13, series);
- Statement stmt = conn.createStatement();
- if (editor.contains('))
- editor.replaceFirst("'", "\"");
- ResultSet rs = stmt.executeQuery("select id from tbl_people where name = '"+editor+"'");
- String editorID = "";
- while(rs.next())
- editorID = rs.getString("id");
- rs.close();
- psInsertDocument.setString(14, editorID);
- if (bookTitle.contains("'"))
- bookTitle.replaceFirst("'", "\"");
- ResultSet rs2 = stmt.executeQuery("select id from tbl_booktitle where title = '"+bookTitle+"'");
- String bookID = "";
- while(rs2.next())
- bookID = rs2.getString("id");
- rs2.close();
- psInsertDocument.setString(15, bookID);
- if (publisher.contains("'"))
- publisher.replaceFirst("'", "\"");
- ResultSet rs3 = stmt.executeQuery("select id from tbl_publisher where publisher_name = '"+publisher+"'");
- String publisherID = "";
- while(rs3.next())
- publisherID = rs3.getString("id");
- rs3.close();
- psInsertDocument.setString(16, publisherID);
- psInsertDocument.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- return;
- }
- if (qName.equalsIgnoreCase("author")) {
- String author = tempVal;
- if (! hashtable.containsValue(author)) {
- hashtable.put(Integer.toString(hashID), author);
- hashID++;
- try {
- psInsertPeople.setString(1, author);
- psInsertPeople.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- return;
- }}
- if (qName.equalsIgnoreCase("editor")) {
- editor = tempVal;
- if (! hashtable.containsValue(editor)) {
- hashtable.put(Integer.toString(hashID), editor);
- hashID++;
- try {
- psInsertPeople.setString(1, editor);
- psInsertPeople.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- return;
- }
- }
- //tempDoc.setPublisher(tempVal);
- if (qName.equalsIgnoreCase("title")) {
- title = tempVal;
- //tempDoc.setTitle(tempVal);
- }
- if (qName.equalsIgnoreCase("year")) {
- year = tempVal;
- //tempDoc.setYear(tempVal);
- }
- if (qName.equalsIgnoreCase("booktitle")) {
- bookTitle = tempVal;
- if (! hashtable.containsValue(bookTitle)) {
- hashtable.put(Integer.toString(hashID), bookTitle);
- hashID++;
- try {
- psInsertTitle.setString(1, bookTitle);
- psInsertTitle.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- }
- return; }
- if (qName.equalsIgnoreCase("publisher")) {
- publisher = tempVal;
- if (! hashtable.containsValue(publisher)) {
- hashtable.put(Integer.toString(hashID), publisher);
- hashID++;
- try {
- psInsertPublisher.setString(1, publisher);
- psInsertPublisher.addBatch();
- }catch (Exception e) {
- e.printStackTrace();
- }
- }return; }
- if (qName.equalsIgnoreCase("isbn")) {
- tempDoc.setISBN(tempVal);
- } if (qName.equalsIgnoreCase("url")) {
- url = tempVal;
- } if (qName.equalsIgnoreCase("pages")) {
- StringTokenizer st = new StringTokenizer(tempVal, "-");
- while(st.hasMoreTokens()) {
- startPage = st.nextToken();
- if (st.hasMoreTokens()) {
- endPage = st.nextToken();
- }
- }
- tempDoc.setPages(tempVal);
- } if (qName.equalsIgnoreCase("address")) {
- tempDoc.setAddress(tempVal);
- } if (qName.equalsIgnoreCase("journal")) {
- tempDoc.setJournal(tempVal);
- } if (qName.equalsIgnoreCase("volume")) {
- volume = tempVal;
- } if (qName.equalsIgnoreCase("number")) {
- number = tempVal;
- tempDoc.setNumber(tempVal);
- } if (qName.equalsIgnoreCase("month")) {
- tempDoc.setMonth(tempVal);
- } if (qName.equalsIgnoreCase("ee")) {
- ee = tempVal;
- } if (qName.equalsIgnoreCase("cdrom")) {
- cdrom = tempVal;
- } if (qName.equalsIgnoreCase("cite")) {
- cite = tempVal;
- } if (qName.equalsIgnoreCase("note")) {
- tempDoc.setNote(tempVal);
- } if (qName.equalsIgnoreCase("crossref")) {
- crossref = tempVal;
- } if (qName.equalsIgnoreCase("series")) {
- series = tempVal;
- } if (qName.equalsIgnoreCase("school")) {
- tempDoc.setSchool(tempVal);
- } if (qName.equalsIgnoreCase("chapter")) {
- tempDoc.setChapter(tempVal);
- }
- // }
- // else if (qName.equalsIgnoreCase("editor")) {
- //// tempDoc.setEditor(tempVal);
- // String editor = tempVal;
- // if (! hashtable.containsValue(editor)) {
- // hashtable.put(Integer.toString(hashID), editor);
- // hashID++;
- // try {
- // psInsertPeople.setString(1, editor+" Editor");
- // psInsertPeople.addBatch();
- // }catch (Exception e) {
- // e.printStackTrace();
- // }
- // }
- // }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement