Advertisement
Guest User

Untitled

a guest
Jul 28th, 2017
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.25 KB | None | 0 0
  1. import java.io.IOException;
  2. import java.util.*;
  3.  
  4. import javax.xml.parsers.ParserConfigurationException;
  5. import javax.xml.parsers.SAXParser;
  6. import javax.xml.parsers.SAXParserFactory;
  7.  
  8. import org.xml.sax.Attributes;
  9. import org.xml.sax.SAXException;
  10.  
  11. import org.xml.sax.helpers.DefaultHandler;
  12.  
  13. import java.sql.*;
  14.  
  15.  
  16. public class ProjectParser extends DefaultHandler{
  17. List myDocs;
  18.  
  19. private String tempVal;
  20. //to maintain context
  21. private Document tempDoc = new Document();
  22. private String currentDoc;
  23.  
  24. static Connection conn = null;
  25.  
  26. static PreparedStatement psInsertGenre = null;
  27. static String sqlInsertGenre = null;
  28.  
  29. static PreparedStatement psInsertPeople = null;
  30. static String sqlInsertPeople = null;
  31.  
  32. static PreparedStatement psInsertTitle = null;
  33. static String sqlInsertTitle = null;
  34.  
  35. static PreparedStatement psInsertPublisher = null;
  36. static String sqlInsertPublisher = null;
  37.  
  38. static PreparedStatement psInsertDocument = null;
  39. static String sqlInsertDocument = null;
  40.  
  41. static Hashtable<String, String> hashtable = new Hashtable<String, String>();
  42. static int hashID = 0;
  43.  
  44.  
  45. // Document Variables
  46. static String title = null;
  47. static String startPage = null;
  48. static String endPage = null;
  49. static String year = null;
  50. static String volume = null;
  51. static String number = null;
  52. static String url = null;
  53. static String ee = null;
  54. static String cdrom = null;
  55. static String crossref = null;
  56. static String cite = null;
  57. static String isbn = null;
  58. static String series = null;
  59. static String editor = null;
  60. static String bookTitle = null;
  61. static String publisher = null;
  62. static int docCount = 0;
  63.  
  64.  
  65. public ProjectParser() {
  66. myDocs = new ArrayList();
  67. }
  68.  
  69.  
  70. public void runExample() {
  71. parseDocument();
  72. }
  73.  
  74. private void parseDocument() {
  75.  
  76. //get a factory
  77. SAXParserFactory spf = SAXParserFactory.newInstance();
  78. try {
  79. //get a new instance of parser
  80. SAXParser sp = spf.newSAXParser();
  81.  
  82. //parse the file and also register this class for call backs
  83. sp.parse("/Users/bshiaw/Desktop/bigData/dblp-data.xml", this);
  84.  
  85. }catch(SAXException se) {
  86. se.printStackTrace();
  87. }catch(ParserConfigurationException pce) {
  88. pce.printStackTrace();
  89. }catch (IOException ie) {
  90. ie.printStackTrace();
  91. }
  92. }
  93.  
  94. /**
  95. * Iterate through the list and print
  96. * the contents
  97. */
  98. private void printData(){
  99.  
  100. System.out.println("Number of Documents '" + myDocs.size() + "'.");
  101.  
  102. Iterator it = myDocs.iterator();
  103. while(it.hasNext()) {
  104. System.out.println(it.next().toString());
  105. }
  106. }
  107.  
  108. public static void main(String[] args) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
  109. // Incorporate mySQL driver
  110. Class.forName("com.mysql.jdbc.Driver").newInstance();
  111.  
  112. // Connect to the test database
  113. try {
  114. conn = DriverManager.getConnection("jdbc:mysql:///bookdb","root", "lakers");
  115. }catch (SQLException e) {
  116. e.printStackTrace();
  117. }
  118.  
  119. int[] iNoRows = null;
  120. //For tbl_genres
  121. sqlInsertGenre = "insert into tbl_genres (genre_name) values(?)";
  122. try {
  123. psInsertGenre = conn.prepareStatement(sqlInsertGenre);
  124. }catch (SQLException e) {
  125. e.printStackTrace();
  126. }
  127.  
  128.  
  129. //For tbl_people
  130. sqlInsertPeople = "INSERT INTO tbl_people (name) values(?)";
  131. try {
  132. psInsertPeople = conn.prepareStatement(sqlInsertPeople);
  133. }catch (SQLException e) {
  134. e.printStackTrace();
  135. }
  136. //For tbl_booktitle
  137. sqlInsertTitle = "INSERT INTO tbl_booktitle (title) values(?)";
  138. try {
  139. psInsertTitle = conn.prepareStatement(sqlInsertTitle);
  140. }catch (SQLException e) {
  141. e.printStackTrace();
  142. }
  143. //For tbl_publisher
  144. sqlInsertPublisher = "INSERT INTO tbl_publisher(publisher_name) values(?)";
  145. try {
  146. psInsertPublisher = conn.prepareStatement(sqlInsertPublisher);
  147. }catch (SQLException e) {
  148. e.printStackTrace();
  149. }
  150. //For tbl_dblp_document
  151. sqlInsertDocument = "INSERT INTO tbl_dblp_document(title, start_page, end_page, year, volume, number, url, ee, cdrom, cite, crossref, isbn, series," +
  152. "editor_id, booktitle_id, publisher_id) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
  153. try {
  154. psInsertDocument = conn.prepareStatement(sqlInsertDocument);
  155. }catch (SQLException e) {
  156. e.printStackTrace();
  157. }
  158.  
  159. ProjectParser pp = new ProjectParser();
  160. pp.runExample();
  161. System.out.println("Parsing Complete.\n");
  162.  
  163. try {
  164. iNoRows = psInsertGenre.executeBatch();
  165. }catch (SQLException e) {
  166. e.printStackTrace();
  167. }
  168.  
  169. try {
  170. iNoRows = psInsertPeople.executeBatch();
  171. }catch (SQLException e) {
  172. e.printStackTrace();
  173. }
  174.  
  175. try {
  176. iNoRows = psInsertTitle.executeBatch();
  177. }catch (SQLException e) {
  178. e.printStackTrace();
  179. }
  180.  
  181. try {
  182. iNoRows = psInsertPublisher.executeBatch();
  183. }catch (SQLException e) {
  184. e.printStackTrace();
  185. }
  186.  
  187. try {
  188. iNoRows = psInsertDocument.executeBatch();
  189. }catch (SQLException e) {
  190. e.printStackTrace();
  191. }
  192.  
  193.  
  194. System.out.println("Execute Batch Complete.");
  195.  
  196. try {
  197. if(psInsertGenre!=null) psInsertGenre.close();
  198. if(psInsertPeople!=null) psInsertPeople.close();
  199. if(psInsertTitle!=null) psInsertTitle.close();
  200. if(psInsertPublisher!=null) psInsertPublisher.close();
  201. if(psInsertDocument!=null) psInsertDocument.close();
  202. if(conn!=null) conn.close();
  203.  
  204. }catch (SQLException e) {
  205. e.printStackTrace();
  206. }
  207. }
  208.  
  209.  
  210.  
  211. //Event Handlers
  212. public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
  213. //reset
  214. tempVal = "";
  215.  
  216. if(qName.equalsIgnoreCase("book")) {
  217. //create a new instance of employee
  218. // tempDoc = new Document();
  219. // tempDoc.setmdate(attributes.getValue("mdate"));
  220. // tempDoc.setKey(attributes.getValue("key"));
  221. // tempDoc.setDocType("Book");
  222. currentDoc = "Book";
  223. if (! hashtable.containsValue(currentDoc)) {
  224. hashtable.put(Integer.toString(hashID), currentDoc);
  225. hashID++;
  226. try {
  227. psInsertGenre.setString(1, currentDoc);
  228. psInsertGenre.addBatch();
  229. }catch (Exception e) {
  230. e.printStackTrace();
  231. }
  232. }
  233. }
  234.  
  235. else if (qName.equalsIgnoreCase("incollection")) {
  236. // tempDoc = new Document();
  237. // tempDoc.setmdate(attributes.getValue("mdate"));
  238. // tempDoc.setKey(attributes.getValue("key"));
  239. // tempDoc.setDocType("Incollection");
  240. currentDoc = "Incollection";
  241. if (! hashtable.containsValue(currentDoc)) {
  242. hashtable.put(Integer.toString(hashID), currentDoc);
  243. hashID++;
  244. try {
  245. psInsertGenre.setString(1, currentDoc);
  246. psInsertGenre.addBatch();
  247. }catch (Exception e) {
  248. e.printStackTrace();
  249. }
  250. }
  251. }
  252.  
  253.  
  254. else if (qName.equalsIgnoreCase("proceedings")) {
  255. // tempDoc = new Document();
  256. // tempDoc.setmdate(attributes.getValue("mdate"));
  257. // tempDoc.setKey(attributes.getValue("key"));
  258. // tempDoc.setDocType("Proceedings");
  259. currentDoc = "Proceedings";
  260. if (! hashtable.containsValue(currentDoc)) {
  261. hashtable.put(Integer.toString(hashID), currentDoc);
  262. hashID++;
  263. try {
  264. psInsertGenre.setString(1, currentDoc);
  265. psInsertGenre.addBatch();
  266. }catch (Exception e) {
  267. e.printStackTrace();
  268. }
  269. }
  270.  
  271. }
  272. else if (qName.equalsIgnoreCase("inproceedings")) {
  273. // tempDoc = new Document();
  274. // tempDoc.setmdate(attributes.getValue("mdate"));
  275. // tempDoc.setKey(attributes.getValue("key"));
  276. // tempDoc.setDocType("Inproceedings");
  277. currentDoc = "Inproceedings";
  278. if (! hashtable.containsValue(currentDoc)) {
  279. hashtable.put(Integer.toString(hashID), currentDoc);
  280. hashID++;
  281. try {
  282. psInsertGenre.setString(1, currentDoc);
  283. psInsertGenre.addBatch();
  284. }catch (Exception e) {
  285. e.printStackTrace();
  286. }
  287. }
  288. }
  289. }
  290.  
  291.  
  292.  
  293. public void characters(char[] ch, int start, int length) throws SAXException {
  294. tempVal = new String(ch,start,length);
  295. }
  296.  
  297. public void endElement(String uri, String localName, String qName) throws SAXException {
  298.  
  299.  
  300. if (qName.equalsIgnoreCase(currentDoc)){
  301. //add it to the list
  302. try{
  303. if (title!=null)
  304. psInsertDocument.setString(1, title);
  305. else
  306. psInsertDocument.setString(1, "No Title");
  307.  
  308. if (startPage!=null)
  309. psInsertDocument.setString(2, startPage);
  310. else
  311. psInsertDocument.setString(2, "0000");
  312.  
  313. if (endPage!=null)
  314. psInsertDocument.setString(3, endPage);
  315. else
  316. psInsertDocument.setString(3, "0000");
  317.  
  318. if(year!=null)
  319. psInsertDocument.setInt(4, Integer.parseInt(year));
  320. else
  321. psInsertDocument.setInt(4, 0000);
  322.  
  323. if (volume!=null)
  324. psInsertDocument.setInt(5, Integer.parseInt(volume));
  325. else
  326. psInsertDocument.setInt(5, 0000);
  327.  
  328. if (number!=null)
  329. psInsertDocument.setInt(6, Integer.parseInt(number));
  330. else
  331. psInsertDocument.setInt(6, 0000);
  332.  
  333. if (url!=null)
  334. psInsertDocument.setString(7, url);
  335. else
  336. psInsertDocument.setString(7, "No URL");
  337.  
  338. if (ee!=null)
  339. psInsertDocument.setString(8, ee);
  340. else
  341. psInsertDocument.setString(8, "No EE");
  342.  
  343. if (cdrom!=null)
  344. psInsertDocument.setString(9, cdrom);
  345. else
  346. psInsertDocument.setString(9, "No CDRom");
  347.  
  348. if (cite!=null)
  349. psInsertDocument.setString(10, cite);
  350. else
  351. psInsertDocument.setString(10, "No Cite");
  352.  
  353. if (crossref!=null)
  354. psInsertDocument.setString(11, crossref);
  355. else
  356. psInsertDocument.setString(11, "No crossref");
  357.  
  358. if (isbn!=null)
  359. psInsertDocument.setString(12, isbn);
  360. else
  361. psInsertDocument.setString(12, "No isbn");
  362.  
  363. if (series!=null)
  364. psInsertDocument.setString(13, series);
  365. else
  366. psInsertDocument.setString(13, "No series");
  367.  
  368.  
  369. Statement stmt = conn.createStatement();
  370. if (editor!=null) {
  371. if (editor.contains("'"))
  372. editor = editor.replaceFirst("'", "''");
  373.  
  374. ResultSet rs = stmt.executeQuery("select id from tbl_people where name = '"+editor+"'");
  375. int editorID = 0;
  376. while(rs.next())
  377. editorID = rs.getInt("id");
  378. rs.close();
  379. psInsertDocument.setInt(14, editorID);
  380. editor = null;
  381. }
  382. else
  383. psInsertDocument.setInt(14, 0000);
  384.  
  385. if(bookTitle!=null) {
  386. if (bookTitle.contains("'"))
  387. bookTitle = bookTitle.replace("'", "''");
  388.  
  389. ResultSet rs2 = stmt.executeQuery("select id from tbl_booktitle where title = '"+bookTitle+"'");
  390. int bookID = 0;
  391. while(rs2.next())
  392. bookID = rs2.getInt("id");
  393. rs2.close();
  394. psInsertDocument.setInt(15, bookID);
  395. bookTitle = null;
  396. }
  397. else
  398. psInsertDocument.setInt(15, 0000);
  399.  
  400. if(publisher!=null) {
  401. if (publisher.contains("'"))
  402. publisher = publisher.replace("'", "''");
  403.  
  404. ResultSet rs3 = stmt.executeQuery("select id from tbl_publisher where publisher_name = '"+publisher+"'");
  405. int publisherID = 0;
  406. while(rs3.next())
  407. publisherID = rs3.getInt("id");
  408. rs3.close();
  409. psInsertDocument.setInt(16, publisherID);
  410. publisher = null;
  411. }
  412. else
  413. psInsertDocument.setInt(16, 0000);
  414.  
  415.  
  416. psInsertDocument.addBatch();
  417. docCount++;
  418.  
  419. if (docCount > 300) {
  420. try{
  421. psInsertDocument.executeBatch();
  422. psInsertDocument.clearBatch();
  423. }catch (Exception e) {
  424. e.printStackTrace();
  425. }
  426. }
  427.  
  428. }catch (Exception e) {
  429. e.printStackTrace();
  430. }
  431.  
  432. return;
  433. }
  434.  
  435.  
  436. if (qName.equalsIgnoreCase("author")) {
  437. String author = tempVal;
  438. if (! hashtable.containsValue(author)) {
  439. hashtable.put(Integer.toString(hashID), author);
  440. hashID++;
  441. try {
  442. psInsertPeople.setString(1, author);
  443. psInsertPeople.addBatch();
  444. }catch (Exception e) {
  445. e.printStackTrace();
  446. }
  447. return;
  448. }}
  449.  
  450. if (qName.equalsIgnoreCase("editor")) {
  451. editor = tempVal;
  452. if (! hashtable.containsValue(editor)) {
  453. hashtable.put(Integer.toString(hashID), editor);
  454. hashID++;
  455. try {
  456. psInsertPeople.setString(1, editor);
  457. psInsertPeople.addBatch();
  458. }catch (Exception e) {
  459. e.printStackTrace();
  460. }
  461. return;
  462. }
  463. }
  464.  
  465. //tempDoc.setPublisher(tempVal);
  466.  
  467. if (qName.equalsIgnoreCase("title")) {
  468. title = tempVal;
  469. //tempDoc.setTitle(tempVal);
  470. }
  471. if (qName.equalsIgnoreCase("year")) {
  472. year = tempVal;
  473. //tempDoc.setYear(tempVal);
  474. }
  475. if (qName.equalsIgnoreCase("booktitle")) {
  476. bookTitle = tempVal;
  477. if (! hashtable.containsValue(bookTitle)) {
  478. hashtable.put(Integer.toString(hashID), bookTitle);
  479. hashID++;
  480. try {
  481. psInsertTitle.setString(1, bookTitle);
  482. psInsertTitle.addBatch();
  483. }catch (Exception e) {
  484. e.printStackTrace();
  485. }
  486. }
  487. return; }
  488.  
  489. if (qName.equalsIgnoreCase("publisher")) {
  490. publisher = tempVal;
  491. if (! hashtable.containsValue(publisher)) {
  492. hashtable.put(Integer.toString(hashID), publisher);
  493. hashID++;
  494. try {
  495. psInsertPublisher.setString(1, publisher);
  496. psInsertPublisher.addBatch();
  497. }catch (Exception e) {
  498. e.printStackTrace();
  499. }
  500. }return; }
  501. if (qName.equalsIgnoreCase("isbn")) {
  502. tempDoc.setISBN(tempVal);
  503. } if (qName.equalsIgnoreCase("url")) {
  504. url = tempVal;
  505. } if (qName.equalsIgnoreCase("pages")) {
  506. StringTokenizer st = new StringTokenizer(tempVal, "-");
  507. while(st.hasMoreTokens()) {
  508. startPage = st.nextToken();
  509. if (st.hasMoreTokens()) {
  510. endPage = st.nextToken();
  511. }
  512. }
  513. tempDoc.setPages(tempVal);
  514. } if (qName.equalsIgnoreCase("address")) {
  515. tempDoc.setAddress(tempVal);
  516. } if (qName.equalsIgnoreCase("journal")) {
  517. tempDoc.setJournal(tempVal);
  518. } if (qName.equalsIgnoreCase("volume")) {
  519. volume = tempVal;
  520. } if (qName.equalsIgnoreCase("number")) {
  521. number = tempVal;
  522. } if (qName.equalsIgnoreCase("month")) {
  523. tempDoc.setMonth(tempVal);
  524. } if (qName.equalsIgnoreCase("ee")) {
  525. ee = tempVal;
  526. } if (qName.equalsIgnoreCase("cdrom")) {
  527. cdrom = tempVal;
  528. } if (qName.equalsIgnoreCase("cite")) {
  529. cite = tempVal;
  530. } if (qName.equalsIgnoreCase("note")) {
  531. tempDoc.setNote(tempVal);
  532. } if (qName.equalsIgnoreCase("crossref")) {
  533. crossref = tempVal;
  534. } if (qName.equalsIgnoreCase("series")) {
  535. series = tempVal;
  536.  
  537. }
  538. }
  539.  
  540.  
  541. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement