Advertisement
Guest User

Untitled

a guest
Jul 28th, 2017
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.97 KB | None | 0 0
  1. import java.io.IOException;
  2. import java.util.*;
  3.  
  4. import javax.xml.parsers.ParserConfigurationException;
  5. import javax.xml.parsers.SAXParser;
  6. import javax.xml.parsers.SAXParserFactory;
  7.  
  8. import org.xml.sax.Attributes;
  9. import org.xml.sax.SAXException;
  10.  
  11. import org.xml.sax.helpers.DefaultHandler;
  12.  
  13. import java.sql.*;
  14.  
  15.  
  16. public class ProjectParser extends DefaultHandler{
  17. List myAuthors;
  18.  
  19. private String tempVal;
  20. //to maintain context
  21. private Document tempDoc = new Document();
  22. private String currentDoc;
  23.  
  24. static Connection conn = null;
  25.  
  26. static PreparedStatement psInsertGenre = null;
  27. static String sqlInsertGenre = null;
  28.  
  29. static PreparedStatement psInsertPeople = null;
  30. static String sqlInsertPeople = null;
  31.  
  32. static PreparedStatement psInsertTitle = null;
  33. static String sqlInsertTitle = null;
  34.  
  35. static PreparedStatement psInsertPublisher = null;
  36. static String sqlInsertPublisher = null;
  37.  
  38. static PreparedStatement psInsertDocument = null;
  39. static String sqlInsertDocument = null;
  40.  
  41. static PreparedStatement psInsertMapping = null;
  42. static String sqlInsertMapping= null;
  43.  
  44. static Hashtable<String, String> hashtable = new Hashtable<String, String>();
  45.  
  46. static int hashID = 0;
  47.  
  48.  
  49. // Document Variables
  50. static String title = null;
  51. static String startPage = null;
  52. static String endPage = null;
  53. static String year = null;
  54. static String volume = null;
  55. static String number = null;
  56. static String url = null;
  57. static String ee = null;
  58. static String cdrom = null;
  59. static String crossref = null;
  60. static String cite = null;
  61. static String isbn = null;
  62. static String series = null;
  63. static String editor = null;
  64. static String bookTitle = null;
  65. static String publisher = null;
  66. static int docCount = 0;
  67. static int docMapCount = 0;
  68. static int docID = 1;
  69. static int currentDocID = 0;
  70. static int peopleID = 0;
  71.  
  72.  
  73. public ProjectParser() {
  74. myAuthors = new ArrayList();
  75. }
  76.  
  77.  
  78. public void runExample() {
  79. parseDocument();
  80. }
  81.  
  82. private void parseDocument() {
  83.  
  84. //get a factory
  85. SAXParserFactory spf = SAXParserFactory.newInstance();
  86. try {
  87. //get a new instance of parser
  88. SAXParser sp = spf.newSAXParser();
  89.  
  90. //parse the file and also register this class for call backs
  91. sp.parse("/Users/bshiaw/Desktop/bigData/dblp-data.xml", this);
  92.  
  93. }catch(SAXException se) {
  94. se.printStackTrace();
  95. }catch(ParserConfigurationException pce) {
  96. pce.printStackTrace();
  97. }catch (IOException ie) {
  98. ie.printStackTrace();
  99. }
  100. }
  101.  
  102. /**
  103. * Iterate through the list and print
  104. * the contents
  105. */
  106. private void printData(){
  107.  
  108. // System.out.println("Number of Documents '" + myDocs.size() + "'.");
  109. //
  110. // Iterator it = myDocs.iterator();
  111. // while(it.hasNext()) {
  112. // System.out.println(it.next().toString());
  113.  
  114. }
  115.  
  116. public static void main(String[] args) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
  117. // Incorporate mySQL driver
  118. Class.forName("com.mysql.jdbc.Driver").newInstance();
  119.  
  120. // Connect to the test database
  121. try {
  122. conn = DriverManager.getConnection("jdbc:mysql:///bookdb","root", "lakers");
  123. }catch (SQLException e) {
  124. e.printStackTrace();
  125. }
  126.  
  127. int[] iNoRows = null;
  128. //For tbl_genres
  129. sqlInsertGenre = "insert into tbl_genres (genre_name) values(?)";
  130. try {
  131. psInsertGenre = conn.prepareStatement(sqlInsertGenre);
  132. }catch (SQLException e) {
  133. e.printStackTrace();
  134. }
  135.  
  136.  
  137. //For tbl_people
  138. sqlInsertPeople = "INSERT INTO tbl_people (name) values(?)";
  139. try {
  140. psInsertPeople = conn.prepareStatement(sqlInsertPeople);
  141. }catch (SQLException e) {
  142. e.printStackTrace();
  143. }
  144. //For tbl_booktitle
  145. sqlInsertTitle = "INSERT INTO tbl_booktitle (title) values(?)";
  146. try {
  147. psInsertTitle = conn.prepareStatement(sqlInsertTitle);
  148. }catch (SQLException e) {
  149. e.printStackTrace();
  150. }
  151. //For tbl_publisher
  152. sqlInsertPublisher = "INSERT INTO tbl_publisher(publisher_name) values(?)";
  153. try {
  154. psInsertPublisher = conn.prepareStatement(sqlInsertPublisher);
  155. }catch (SQLException e) {
  156. e.printStackTrace();
  157. }
  158. //For tbl_dblp_document
  159. sqlInsertDocument = "INSERT INTO tbl_dblp_document(title, start_page, end_page, year, volume, number, url, ee, cdrom, cite, crossref, isbn, series," +
  160. "editor_id, booktitle_id, publisher_id) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
  161. try {
  162. psInsertDocument = conn.prepareStatement(sqlInsertDocument);
  163. }catch (SQLException e) {
  164. e.printStackTrace();
  165. }
  166.  
  167. //For tbl_author_document_mapping
  168. sqlInsertMapping = "INSERT INTO tbl_author_document_mapping (doc_id, author_id) values(?, ?)";
  169. try {
  170. psInsertMapping = conn.prepareStatement(sqlInsertMapping);
  171. }catch (SQLException e) {
  172. e.printStackTrace();
  173. }
  174.  
  175. ProjectParser pp = new ProjectParser();
  176. pp.runExample();
  177. System.out.println("Parsing Complete.\n");
  178.  
  179. try {
  180. iNoRows = psInsertGenre.executeBatch();
  181. }catch (SQLException e) {
  182. e.printStackTrace();
  183. }
  184.  
  185. try {
  186. iNoRows = psInsertPeople.executeBatch();
  187. }catch (SQLException e) {
  188. e.printStackTrace();
  189. }
  190.  
  191. try {
  192. iNoRows = psInsertTitle.executeBatch();
  193. }catch (SQLException e) {
  194. e.printStackTrace();
  195. }
  196.  
  197. try {
  198. iNoRows = psInsertPublisher.executeBatch();
  199. }catch (SQLException e) {
  200. e.printStackTrace();
  201. }
  202.  
  203. try {
  204. iNoRows = psInsertDocument.executeBatch();
  205. }catch (SQLException e) {
  206. e.printStackTrace();
  207. }
  208.  
  209. try {
  210. iNoRows = psInsertMapping.executeBatch();
  211. }catch (SQLException e) {
  212. e.printStackTrace();
  213. }
  214.  
  215.  
  216. System.out.println("Execute Batch Complete.");
  217.  
  218. try {
  219. if(psInsertGenre!=null) psInsertGenre.close();
  220. if(psInsertPeople!=null) psInsertPeople.close();
  221. if(psInsertTitle!=null) psInsertTitle.close();
  222. if(psInsertPublisher!=null) psInsertPublisher.close();
  223. if(psInsertDocument!=null) psInsertDocument.close();
  224. if(psInsertMapping!=null) psInsertMapping.close();
  225. if(conn!=null) conn.close();
  226.  
  227. }catch (SQLException e) {
  228. e.printStackTrace();
  229. }
  230. }
  231.  
  232.  
  233.  
  234.  
  235. //Event Handlers
  236. public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
  237. //reset
  238. tempVal = "";
  239. if(qName.equalsIgnoreCase("book")) {
  240. //create a new instance of employee
  241. // tempDoc = new Document();
  242. // tempDoc.setmdate(attributes.getValue("mdate"));
  243. // tempDoc.setKey(attributes.getValue("key"));
  244. // tempDoc.setDocType("Book");
  245. currentDoc = "Book";
  246. if (! hashtable.containsValue(currentDoc)) {
  247. hashtable.put(Integer.toString(hashID), currentDoc);
  248. hashID++;
  249. try {
  250. psInsertGenre.setString(1, currentDoc);
  251. psInsertGenre.addBatch();
  252. }catch (Exception e) {
  253. e.printStackTrace();
  254. }
  255. }
  256. }
  257.  
  258. else if (qName.equalsIgnoreCase("incollection")) {
  259. // tempDoc = new Document();
  260. // tempDoc.setmdate(attributes.getValue("mdate"));
  261. // tempDoc.setKey(attributes.getValue("key"));
  262. // tempDoc.setDocType("Incollection");
  263. currentDoc = "Incollection";
  264. if (! hashtable.containsValue(currentDoc)) {
  265. hashtable.put(Integer.toString(hashID), currentDoc);
  266. hashID++;
  267. try {
  268. psInsertGenre.setString(1, currentDoc);
  269. psInsertGenre.addBatch();
  270. }catch (Exception e) {
  271. e.printStackTrace();
  272. }
  273. }
  274.  
  275. }
  276.  
  277.  
  278. else if (qName.equalsIgnoreCase("proceedings")) {
  279. // tempDoc = new Document();
  280. // tempDoc.setmdate(attributes.getValue("mdate"));
  281. // tempDoc.setKey(attributes.getValue("key"));
  282. // tempDoc.setDocType("Proceedings");
  283. currentDoc = "Proceedings";
  284. if (! hashtable.containsValue(currentDoc)) {
  285. hashtable.put(Integer.toString(hashID), currentDoc);
  286. hashID++;
  287. try {
  288. psInsertGenre.setString(1, currentDoc);
  289. psInsertGenre.addBatch();
  290. }catch (Exception e) {
  291. e.printStackTrace();
  292. }
  293. }
  294.  
  295. }
  296. else if (qName.equalsIgnoreCase("inproceedings")) {
  297. // tempDoc = new Document();
  298. // tempDoc.setmdate(attributes.getValue("mdate"));
  299. // tempDoc.setKey(attributes.getValue("key"));
  300. // tempDoc.setDocType("Inproceedings");
  301. currentDoc = "Inproceedings";
  302. if (! hashtable.containsValue(currentDoc)) {
  303. hashtable.put(Integer.toString(hashID), currentDoc);
  304. hashID++;
  305. try {
  306. psInsertGenre.setString(1, currentDoc);
  307. psInsertGenre.addBatch();
  308. }catch (Exception e) {
  309. e.printStackTrace();
  310. }
  311. }
  312.  
  313. }
  314. }
  315.  
  316.  
  317.  
  318. public void characters(char[] ch, int start, int length) throws SAXException {
  319. tempVal = new String(ch,start,length);
  320. }
  321.  
  322. public void endElement(String uri, String localName, String qName) throws SAXException {
  323.  
  324.  
  325. if (qName.equalsIgnoreCase(currentDoc)){
  326. //add it to the list
  327. try{
  328. if (title!=null) {
  329. psInsertDocument.setString(1, title);
  330. title = null;}
  331. else
  332. psInsertDocument.setString(1, "No Title");
  333.  
  334. if (startPage!=null) {
  335. psInsertDocument.setString(2, startPage);
  336. startPage = null; }
  337. else
  338. psInsertDocument.setString(2, "0000");
  339.  
  340. if (endPage!=null) {
  341. psInsertDocument.setString(3, endPage);
  342. endPage = null;
  343. }
  344. else
  345. psInsertDocument.setString(3, "0000");
  346.  
  347. if(year!=null) {
  348. psInsertDocument.setInt(4, Integer.parseInt(year));
  349. year = null;
  350. }
  351. else
  352. psInsertDocument.setInt(4, 0000);
  353.  
  354. if (volume!=null) {
  355. psInsertDocument.setString(5, volume);
  356. volume = null;
  357. }
  358. else
  359. psInsertDocument.setString(5, "0000");
  360.  
  361. if (number!=null) {
  362. psInsertDocument.setInt(6, Integer.parseInt(number));
  363. number = null;
  364. }
  365. else
  366. psInsertDocument.setInt(6, 0000);
  367.  
  368. if (url!=null) {
  369. psInsertDocument.setString(7, url);
  370. url = null;
  371. }
  372. else
  373. psInsertDocument.setString(7, "No URL");
  374.  
  375. if (ee!=null) {
  376. psInsertDocument.setString(8, ee);
  377. ee=null; }
  378. else
  379. psInsertDocument.setString(8, "No EE");
  380.  
  381. if (cdrom!=null) {
  382. psInsertDocument.setString(9, cdrom);
  383. cdrom = null; }
  384. else
  385. psInsertDocument.setString(9, "No CDRom");
  386.  
  387. if (cite!=null) {
  388. psInsertDocument.setString(10, cite);
  389. cite = null;
  390. }
  391. else
  392. psInsertDocument.setString(10, "No Cite");
  393.  
  394. if (crossref!=null) {
  395. psInsertDocument.setString(11, crossref);
  396. crossref = null;
  397. }
  398. else
  399. psInsertDocument.setString(11, "No crossref");
  400.  
  401. if (isbn!=null) {
  402. psInsertDocument.setString(12, isbn);
  403. isbn = null;
  404. }
  405. else
  406. psInsertDocument.setString(12, "No isbn");
  407.  
  408. if (series!=null) {
  409. psInsertDocument.setString(13, series);
  410. series = null;
  411. }
  412. else
  413. psInsertDocument.setString(13, "No series");
  414.  
  415.  
  416. Statement stmt = conn.createStatement();
  417. if (editor!=null) {
  418. if (editor.contains("'"))
  419. editor = editor.replaceFirst("'", "''");
  420.  
  421. ResultSet rs = stmt.executeQuery("select id from tbl_people where name = '"+editor+"'");
  422. int editorID = 0;
  423. while(rs.next())
  424. editorID = rs.getInt("id");
  425. rs.close();
  426. psInsertDocument.setInt(14, editorID);
  427. editor = null;
  428. }
  429. else
  430. psInsertDocument.setInt(14, 0000);
  431.  
  432. if(bookTitle!=null) {
  433. if (bookTitle.contains("'"))
  434. bookTitle = bookTitle.replace("'", "''");
  435.  
  436. ResultSet rs2 = stmt.executeQuery("select id from tbl_booktitle where title = '"+bookTitle+"'");
  437. int bookID = 0;
  438. while(rs2.next())
  439. bookID = rs2.getInt("id");
  440. rs2.close();
  441. psInsertDocument.setInt(15, bookID);
  442. bookTitle = null;
  443. }
  444. else
  445. psInsertDocument.setInt(15, 0000);
  446.  
  447. if(publisher!=null) {
  448. if (publisher.contains("'"))
  449. publisher = publisher.replace("'", "''");
  450.  
  451. ResultSet rs3 = stmt.executeQuery("select id from tbl_publisher where publisher_name = '"+publisher+"'");
  452. int publisherID = 0;
  453. while(rs3.next())
  454. publisherID = rs3.getInt("id");
  455. rs3.close();
  456. psInsertDocument.setInt(16, publisherID);
  457. publisher = null;
  458. }
  459. else
  460. psInsertDocument.setInt(16, 0000);
  461.  
  462.  
  463. psInsertDocument.addBatch();
  464. docCount++;
  465.  
  466. if (docCount > 6000) {
  467. try{
  468. psInsertDocument.executeBatch();
  469. psInsertDocument.clearBatch();
  470. }catch (Exception e) {
  471. e.printStackTrace();
  472. }
  473. }
  474.  
  475. if (docMapCount > 6000) {
  476. try{
  477. psInsertMapping.executeBatch();
  478. psInsertMapping.clearBatch();
  479. }catch (Exception e) {
  480. e.printStackTrace();
  481. }
  482. }
  483.  
  484. }catch (Exception e) {
  485. e.printStackTrace();
  486. }
  487. docID++;
  488. return;
  489. }
  490.  
  491.  
  492. if (qName.equalsIgnoreCase("author")) {
  493. String author = tempVal;
  494. Author a;
  495. if (! hashtable.containsValue(author)) {
  496. hashtable.put(Integer.toString(hashID), author);
  497. hashID++;
  498. peopleID++;
  499. a = new Author();
  500. a.setAuthorID(peopleID);
  501. a.setAuthorName(author);
  502. myAuthors.add(a);
  503. try {
  504. psInsertPeople.setString(1, author);
  505. psInsertPeople.addBatch();
  506. }catch (Exception e) {
  507. e.printStackTrace();
  508. }
  509. }
  510. try {
  511. psInsertMapping.setInt(1, docID);
  512. Iterator it = myAuthors.iterator();
  513. while (it.hasNext()) {
  514. Author tempAuth = (Author) it.next();
  515. if (tempAuth.getAuthorName().equals(author))
  516. psInsertMapping.setInt(2, tempAuth.getAuthorID());
  517. }
  518. psInsertMapping.addBatch();
  519. }catch (Exception e) {
  520. e.printStackTrace();
  521. }
  522.  
  523. }
  524. if (qName.equalsIgnoreCase("editor")) {
  525.  
  526. editor = tempVal;
  527. if (! hashtable.containsValue(editor)) {
  528. hashtable.put(Integer.toString(hashID), editor);
  529. hashID++;
  530. peopleID++;
  531. try {
  532. psInsertPeople.setString(1, editor);
  533. psInsertPeople.addBatch();
  534. }catch (Exception e) {
  535. e.printStackTrace();
  536. }
  537. return;
  538. }
  539. }
  540.  
  541. //tempDoc.setPublisher(tempVal);
  542.  
  543. if (qName.equalsIgnoreCase("title")) {
  544. title = tempVal;
  545. //tempDoc.setTitle(tempVal);
  546. }
  547. if (qName.equalsIgnoreCase("year")) {
  548. year = tempVal;
  549. //tempDoc.setYear(tempVal);
  550. }
  551. if (qName.equalsIgnoreCase("booktitle")) {
  552. bookTitle = tempVal;
  553. if (! hashtable.containsValue(bookTitle)) {
  554. hashtable.put(Integer.toString(hashID), bookTitle);
  555. hashID++;
  556. try {
  557. psInsertTitle.setString(1, bookTitle);
  558. psInsertTitle.addBatch();
  559. }catch (Exception e) {
  560. e.printStackTrace();
  561. }
  562. }
  563. return; }
  564.  
  565. if (qName.equalsIgnoreCase("publisher")) {
  566. publisher = tempVal;
  567. if (! hashtable.containsValue(publisher)) {
  568. hashtable.put(Integer.toString(hashID), publisher);
  569. hashID++;
  570. try {
  571. psInsertPublisher.setString(1, publisher);
  572. psInsertPublisher.addBatch();
  573. }catch (Exception e) {
  574. e.printStackTrace();
  575. }
  576. }return; }
  577. if (qName.equalsIgnoreCase("isbn")) {
  578. tempDoc.setISBN(tempVal);
  579. } if (qName.equalsIgnoreCase("url")) {
  580. url = tempVal;
  581. } if (qName.equalsIgnoreCase("pages")) {
  582. StringTokenizer st = new StringTokenizer(tempVal, "-");
  583. while(st.hasMoreTokens()) {
  584. startPage = st.nextToken();
  585. if (st.hasMoreTokens()) {
  586. endPage = st.nextToken();
  587. }
  588. }
  589. tempDoc.setPages(tempVal);
  590. } if (qName.equalsIgnoreCase("address")) {
  591. tempDoc.setAddress(tempVal);
  592. } if (qName.equalsIgnoreCase("journal")) {
  593. tempDoc.setJournal(tempVal);
  594. } if (qName.equalsIgnoreCase("volume")) {
  595. volume = tempVal;
  596. } if (qName.equalsIgnoreCase("number")) {
  597. number = tempVal;
  598. } if (qName.equalsIgnoreCase("month")) {
  599. tempDoc.setMonth(tempVal);
  600. } if (qName.equalsIgnoreCase("ee")) {
  601. ee = tempVal;
  602. } if (qName.equalsIgnoreCase("cdrom")) {
  603. cdrom = tempVal;
  604. } if (qName.equalsIgnoreCase("cite")) {
  605. cite = tempVal;
  606. } if (qName.equalsIgnoreCase("note")) {
  607. tempDoc.setNote(tempVal);
  608. } if (qName.equalsIgnoreCase("crossref")) {
  609. crossref = tempVal;
  610. } if (qName.equalsIgnoreCase("series")) {
  611. series = tempVal;
  612.  
  613. }
  614. }
  615.  
  616.  
  617. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement