Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.IOException;
- import java.util.*;
- import javax.xml.parsers.ParserConfigurationException;
- import javax.xml.parsers.SAXParser;
- import javax.xml.parsers.SAXParserFactory;
- import org.xml.sax.Attributes;
- import org.xml.sax.SAXException;
- import org.xml.sax.helpers.DefaultHandler;
- import java.sql.*;
/**
 * SAX handler that reads a DBLP-style XML file and queues JDBC batch inserts
 * for the genre, people, book-title, publisher and document tables.
 *
 * <p>Records of type book, incollection, proceedings and inproceedings are
 * turned into rows of {@code tbl_dblp_document}. Author, editor, booktitle and
 * publisher values are de-duplicated in memory and inserted into their own
 * tables. The JDBC connection and prepared statements are static and are set
 * up by {@link #main(String[])}; the handler assumes a single parsing thread.
 */
public class ProjectParser extends DefaultHandler {

    /** XML file parsed when no path is supplied. */
    private static final String DEFAULT_XML_PATH = "/Users/bshiaw/Desktop/bigData/dblp-data.xml";

    /** Number of queued document rows after which the document batch is flushed. */
    private static final int DOCUMENT_BATCH_SIZE = 300;

    /** Record element names that produce a document row, in canonical (genre) spelling. */
    private static final String[] DOCUMENT_TYPES = {
        "Book", "Incollection", "Proceedings", "Inproceedings"
    };

    // Raw type kept so existing package-level users of this field still compile.
    List myDocs;

    /** Text collected for the element currently being read. */
    private String tempVal = "";

    /** Genre name of the most recently opened record element, or null before the first one. */
    private String currentDoc;

    /** Location of the XML file handed to the SAX parser. */
    private final String xmlPath;

    static Connection conn = null;
    static PreparedStatement psInsertGenre = null;
    static String sqlInsertGenre = null;
    static PreparedStatement psInsertPeople = null;
    static String sqlInsertPeople = null;
    static PreparedStatement psInsertTitle = null;
    static String sqlInsertTitle = null;
    static PreparedStatement psInsertPublisher = null;
    static String sqlInsertPublisher = null;
    static PreparedStatement psInsertDocument = null;
    static String sqlInsertDocument = null;

    // Legacy registry of every value queued for insertion; still populated, but
    // membership tests use the per-table sets below (containsValue is O(n)).
    static Hashtable<String, String> hashtable = new Hashtable<String, String>();
    static int hashID = 0;

    // One set per target table so that a value seen in one table does not
    // suppress the same text in another.
    private static final Set<String> seenGenres = new HashSet<String>();
    private static final Set<String> seenPeople = new HashSet<String>();
    private static final Set<String> seenBookTitles = new HashSet<String>();
    private static final Set<String> seenPublishers = new HashSet<String>();

    // Fields of the record currently being read.
    static String title = null;
    static String startPage = null;
    static String endPage = null;
    static String year = null;
    static String volume = null;
    static String number = null;
    static String url = null;
    static String ee = null;
    static String cdrom = null;
    static String crossref = null;
    static String cite = null;
    static String isbn = null;
    static String series = null;
    static String editor = null;
    static String bookTitle = null;
    static String publisher = null;
    static int docCount = 0;

    /** Creates a parser that reads the default XML file. */
    public ProjectParser() {
        this(DEFAULT_XML_PATH);
    }

    /**
     * Creates a parser that reads the given XML file.
     *
     * @param xmlPath path or URI of the XML file to parse
     */
    public ProjectParser(String xmlPath) {
        this.xmlPath = xmlPath;
        myDocs = new ArrayList();
    }

    /** Parses the configured XML file, firing this handler's callbacks. */
    public void runExample() {
        parseDocument();
    }

    /** Runs a SAX parse of {@link #xmlPath} with this object as the handler. */
    private void parseDocument() {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            SAXParser parser = factory.newSAXParser();
            parser.parse(xmlPath, this);
        } catch (SAXException se) {
            se.printStackTrace();
        } catch (ParserConfigurationException pce) {
            pce.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
    }

    /** Prints the size of {@link #myDocs} followed by each element. */
    private void printData() {
        System.out.println("Number of Documents '" + myDocs.size() + "'.");
        for (Object doc : myDocs) {
            System.out.println(doc.toString());
        }
    }

    /**
     * Connects to the database, prepares the insert statements, parses the
     * XML file and executes all queued batches.
     *
     * <p>The XML path may be given as the first argument. Connection settings
     * may be overridden with the system properties {@code bookdb.url},
     * {@code bookdb.user} and {@code bookdb.password}; the defaults are the
     * original hard-coded values.
     *
     * @param args optional: {@code args[0]} is the XML file to parse
     */
    public static void main(String[] args) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
        // Load the MySQL driver.
        Class.forName("com.mysql.jdbc.Driver").newInstance();

        sqlInsertGenre = "insert into tbl_genres (genre_name) values(?)";
        sqlInsertPeople = "INSERT INTO tbl_people (name) values(?)";
        sqlInsertTitle = "INSERT INTO tbl_booktitle (title) values(?)";
        sqlInsertPublisher = "INSERT INTO tbl_publisher(publisher_name) values(?)";
        sqlInsertDocument = "INSERT INTO tbl_dblp_document(title, start_page, end_page, year, volume, number, url, ee, cdrom, cite, crossref, isbn, series," +
                "editor_id, booktitle_id, publisher_id) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";

        try {
            // NOTE(review): default credentials are kept for compatibility;
            // prefer supplying them through the system properties.
            conn = DriverManager.getConnection(
                    System.getProperty("bookdb.url", "jdbc:mysql:///bookdb"),
                    System.getProperty("bookdb.user", "root"),
                    System.getProperty("bookdb.password", "lakers"));
            psInsertGenre = conn.prepareStatement(sqlInsertGenre);
            psInsertPeople = conn.prepareStatement(sqlInsertPeople);
            psInsertTitle = conn.prepareStatement(sqlInsertTitle);
            psInsertPublisher = conn.prepareStatement(sqlInsertPublisher);
            psInsertDocument = conn.prepareStatement(sqlInsertDocument);
        } catch (SQLException e) {
            // Without a connection and all statements nothing useful can be
            // loaded, so stop instead of failing later on a null reference.
            e.printStackTrace();
            closeResources();
            return;
        }

        try {
            ProjectParser pp = args.length > 0 ? new ProjectParser(args[0]) : new ProjectParser();
            pp.runExample();
            System.out.println("Parsing Complete.\n");

            // Each batch is flushed independently so one failure does not
            // prevent the others from being written.
            flushBatch(psInsertGenre);
            flushBatch(psInsertPeople);
            flushBatch(psInsertTitle);
            flushBatch(psInsertPublisher);
            flushBatch(psInsertDocument);
            System.out.println("Execute Batch Complete.");
        } finally {
            closeResources();
        }
    }

    /**
     * Starts a new element: clears the collected text and, for a record
     * element, starts a fresh record and queues its genre if not seen before.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        tempVal = "";
        for (String type : DOCUMENT_TYPES) {
            if (qName.equalsIgnoreCase(type)) {
                currentDoc = type;
                // Drop anything left over from earlier elements so it cannot
                // end up in this record's row.
                resetDocumentFields();
                if (markSeen(seenGenres, type)) {
                    addToBatch(psInsertGenre, type);
                }
                return;
            }
        }
    }

    /**
     * Collects element text. The parser may deliver the text of one element
     * in several calls, so each chunk is appended rather than replacing the
     * previous one.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        tempVal += new String(ch, start, length);
    }

    /**
     * Finishes an element: a closing record element queues the document row;
     * any other recognised element stores its text in the matching field.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (currentDoc != null && qName.equalsIgnoreCase(currentDoc)) {
            queueDocument();
            return;
        }

        if (qName.equalsIgnoreCase("author")) {
            registerValue(seenPeople, psInsertPeople, tempVal);
        } else if (qName.equalsIgnoreCase("editor")) {
            editor = tempVal;
            registerValue(seenPeople, psInsertPeople, editor);
        } else if (qName.equalsIgnoreCase("title")) {
            title = tempVal;
        } else if (qName.equalsIgnoreCase("year")) {
            year = tempVal;
        } else if (qName.equalsIgnoreCase("booktitle")) {
            bookTitle = tempVal;
            registerValue(seenBookTitles, psInsertTitle, bookTitle);
        } else if (qName.equalsIgnoreCase("publisher")) {
            publisher = tempVal;
            registerValue(seenPublishers, psInsertPublisher, publisher);
        } else if (qName.equalsIgnoreCase("isbn")) {
            isbn = tempVal;
        } else if (qName.equalsIgnoreCase("url")) {
            url = tempVal;
        } else if (qName.equalsIgnoreCase("pages")) {
            // "12-34" -> start 12, end 34. With more than two tokens the last
            // pair wins, as before.
            StringTokenizer tokens = new StringTokenizer(tempVal, "-");
            while (tokens.hasMoreTokens()) {
                startPage = tokens.nextToken();
                if (tokens.hasMoreTokens()) {
                    endPage = tokens.nextToken();
                }
            }
        } else if (qName.equalsIgnoreCase("volume")) {
            volume = tempVal;
        } else if (qName.equalsIgnoreCase("number")) {
            number = tempVal;
        } else if (qName.equalsIgnoreCase("ee")) {
            ee = tempVal;
        } else if (qName.equalsIgnoreCase("cdrom")) {
            cdrom = tempVal;
        } else if (qName.equalsIgnoreCase("cite")) {
            cite = tempVal;
        } else if (qName.equalsIgnoreCase("crossref")) {
            crossref = tempVal;
        } else if (qName.equalsIgnoreCase("series")) {
            series = tempVal;
        }
    }

    /** Binds the current record's fields to the document insert and queues it. */
    private void queueDocument() {
        if (conn == null || psInsertDocument == null) {
            System.err.println("Database not initialised; skipping document '" + title + "'");
            resetDocumentFields();
            return;
        }
        try {
            psInsertDocument.setString(1, orDefault(title, "No Title"));
            psInsertDocument.setString(2, orDefault(startPage, "0000"));
            psInsertDocument.setString(3, orDefault(endPage, "0000"));
            psInsertDocument.setInt(4, parseIntOrZero(year));
            psInsertDocument.setInt(5, parseIntOrZero(volume));
            psInsertDocument.setInt(6, parseIntOrZero(number));
            psInsertDocument.setString(7, orDefault(url, "No URL"));
            psInsertDocument.setString(8, orDefault(ee, "No EE"));
            psInsertDocument.setString(9, orDefault(cdrom, "No CDRom"));
            psInsertDocument.setString(10, orDefault(cite, "No Cite"));
            psInsertDocument.setString(11, orDefault(crossref, "No crossref"));
            psInsertDocument.setString(12, orDefault(isbn, "No isbn"));
            psInsertDocument.setString(13, orDefault(series, "No series"));
            psInsertDocument.setInt(14, editor == null ? 0
                    : lookupId(psInsertPeople, "select id from tbl_people where name = ?", editor));
            psInsertDocument.setInt(15, bookTitle == null ? 0
                    : lookupId(psInsertTitle, "select id from tbl_booktitle where title = ?", bookTitle));
            psInsertDocument.setInt(16, publisher == null ? 0
                    : lookupId(psInsertPublisher, "select id from tbl_publisher where publisher_name = ?", publisher));
            psInsertDocument.addBatch();

            docCount++;
            if (docCount > DOCUMENT_BATCH_SIZE) {
                // Reset the counter so the next flush happens after another
                // full batch rather than on every following record.
                docCount = 0;
                psInsertDocument.executeBatch();
                psInsertDocument.clearBatch();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            resetDocumentFields();
        }
    }

    /**
     * Returns the id of the row whose lookup column equals {@code value}, or 0
     * when there is none. Pending inserts for that table are executed first so
     * that values queued earlier in this run can be found.
     */
    private static int lookupId(PreparedStatement pendingInserts, String query, String value) throws SQLException {
        flushBatch(pendingInserts);
        PreparedStatement lookup = conn.prepareStatement(query);
        try {
            lookup.setString(1, value);
            ResultSet rows = lookup.executeQuery();
            try {
                int id = 0;
                while (rows.next()) {
                    id = rows.getInt("id");
                }
                return id;
            } finally {
                rows.close();
            }
        } finally {
            lookup.close();
        }
    }

    /** Queues {@code value} for insertion through {@code insert} unless it was already queued. */
    private static void registerValue(Set<String> seen, PreparedStatement insert, String value) {
        if (markSeen(seen, value)) {
            addToBatch(insert, value);
        }
    }

    /** Records {@code value} as seen; returns true only the first time it is offered. */
    private static boolean markSeen(Set<String> seen, String value) {
        if (!seen.add(value)) {
            return false;
        }
        hashtable.put(Integer.toString(hashID), value);
        hashID++;
        return true;
    }

    /** Adds a single-parameter insert for {@code value} to the statement's batch. */
    private static void addToBatch(PreparedStatement insert, String value) {
        if (insert == null) {
            System.err.println("Insert statement not prepared; skipping '" + value + "'");
            return;
        }
        try {
            insert.setString(1, value);
            insert.addBatch();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Executes the statement's pending batch, reporting but not propagating failures. */
    private static void flushBatch(PreparedStatement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.executeBatch();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes every statement and the connection; one failure does not stop the rest. */
    private static void closeResources() {
        closeQuietly(psInsertGenre);
        closeQuietly(psInsertPeople);
        closeQuietly(psInsertTitle);
        closeQuietly(psInsertPublisher);
        closeQuietly(psInsertDocument);
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes a statement if it is non-null, printing any failure. */
    private static void closeQuietly(Statement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Clears all per-record fields. */
    private static void resetDocumentFields() {
        title = null;
        startPage = null;
        endPage = null;
        year = null;
        volume = null;
        number = null;
        url = null;
        ee = null;
        cdrom = null;
        crossref = null;
        cite = null;
        isbn = null;
        series = null;
        editor = null;
        bookTitle = null;
        publisher = null;
    }

    /** Returns {@code value}, or {@code fallback} when {@code value} is null. */
    private static String orDefault(String value, String fallback) {
        return value != null ? value : fallback;
    }

    /** Parses {@code text} as an int; returns 0 when it is null or not a number. */
    private static int parseIntOrZero(String text) {
        if (text == null) {
            return 0;
        }
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement