Advertisement
Guest User

Untitled

a guest
Jul 28th, 2017
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.23 KB | None | 0 0
  1. import java.io.IOException;
  2. import java.util.*;
  3.  
  4. import javax.xml.parsers.ParserConfigurationException;
  5. import javax.xml.parsers.SAXParser;
  6. import javax.xml.parsers.SAXParserFactory;
  7.  
  8. import org.xml.sax.Attributes;
  9. import org.xml.sax.SAXException;
  10.  
  11. import org.xml.sax.helpers.DefaultHandler;
  12.  
  13. import java.sql.*;
  14.  
  15.  
  16. public class ProjectParser extends DefaultHandler{
  17. List myDocs;
  18.  
  19. private String tempVal;
  20. //to maintain context
  21. private Document tempDoc = new Document();
  22. private String currentDoc;
  23.  
  24. static Connection conn = null;
  25.  
  26. static PreparedStatement psInsertGenre = null;
  27. static String sqlInsertGenre = null;
  28.  
  29. static PreparedStatement psInsertPeople = null;
  30. static String sqlInsertPeople = null;
  31.  
  32. static PreparedStatement psInsertTitle = null;
  33. static String sqlInsertTitle = null;
  34.  
  35. static PreparedStatement psInsertPublisher = null;
  36. static String sqlInsertPublisher = null;
  37.  
  38. static PreparedStatement psInsertDocument = null;
  39. static String sqlInsertDocument = null;
  40.  
  41. static Hashtable<String, String> hashtable = new Hashtable<String, String>();
  42. static int hashID = 0;
  43.  
  44.  
  45. // Document Variables
  46. static String title;
  47. static String startPage;
  48. static String endPage;
  49. static String year;
  50. static String volume;
  51. static String number;
  52. static String url;
  53. static String ee;
  54. static String cdrom;
  55. static String crossref;
  56. static String cite;
  57. static String isbn;
  58. static String series;
  59. static String editor;
  60. static String bookTitle;
  61. static String publisher;
  62.  
  63.  
  64. public ProjectParser() {
  65. myDocs = new ArrayList();
  66. }
  67.  
  68.  
  69. public void runExample() {
  70. parseDocument();
  71. }
  72.  
  73. private void parseDocument() {
  74.  
  75. //get a factory
  76. SAXParserFactory spf = SAXParserFactory.newInstance();
  77. try {
  78. //get a new instance of parser
  79. SAXParser sp = spf.newSAXParser();
  80.  
  81. //parse the file and also register this class for call backs
  82. sp.parse("/Users/bshiaw/Desktop/bigData/dblp-data.xml", this);
  83.  
  84. }catch(SAXException se) {
  85. se.printStackTrace();
  86. }catch(ParserConfigurationException pce) {
  87. pce.printStackTrace();
  88. }catch (IOException ie) {
  89. ie.printStackTrace();
  90. }
  91. }
  92.  
  93. /**
  94. * Iterate through the list and print
  95. * the contents
  96. */
  97. private void printData(){
  98.  
  99. System.out.println("Number of Documents '" + myDocs.size() + "'.");
  100.  
  101. Iterator it = myDocs.iterator();
  102. while(it.hasNext()) {
  103. System.out.println(it.next().toString());
  104. }
  105. }
  106.  
  107. public static void main(String[] args) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
  108. // Incorporate mySQL driver
  109. Class.forName("com.mysql.jdbc.Driver").newInstance();
  110.  
  111. // Connect to the test database
  112. try {
  113. conn = DriverManager.getConnection("jdbc:mysql:///bookdb","root", "lakers");
  114. }catch (SQLException e) {
  115. e.printStackTrace();
  116. }
  117.  
  118. int[] iNoRows = null;
  119. //For tbl_genres
  120. sqlInsertGenre = "insert into tbl_genres (genre_name) values(?)";
  121. try {
  122. psInsertGenre = conn.prepareStatement(sqlInsertGenre);
  123. }catch (SQLException e) {
  124. e.printStackTrace();
  125. }
  126.  
  127.  
  128. //For tbl_people
  129. sqlInsertPeople = "INSERT INTO tbl_people (name) values(?)";
  130. try {
  131. psInsertPeople = conn.prepareStatement(sqlInsertPeople);
  132. }catch (SQLException e) {
  133. e.printStackTrace();
  134. }
  135. //For tbl_booktitle
  136. sqlInsertTitle = "INSERT INTO tbl_booktitle (title) values(?)";
  137. try {
  138. psInsertTitle = conn.prepareStatement(sqlInsertTitle);
  139. }catch (SQLException e) {
  140. e.printStackTrace();
  141. }
  142. //For tbl_publisher
  143. sqlInsertPublisher = "INSERT INTO tbl_publisher(publisher_name) values(?)";
  144. try {
  145. psInsertPublisher = conn.prepareStatement(sqlInsertPublisher);
  146. }catch (SQLException e) {
  147. e.printStackTrace();
  148. }
  149. //For tbl_dblp_document
  150. sqlInsertDocument = "INSERT INTO tbl_dblp_document(title, start_page, end_page, year, volume, number, url, ee, cdrom, cite, crossref, isbn, series," +
  151. "editor_id, booktitle_id, publisher_id values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
  152. try {
  153. psInsertDocument = conn.prepareStatement(sqlInsertDocument);
  154. }catch (SQLException e) {
  155. e.printStackTrace();
  156. }
  157.  
  158. ProjectParser pp = new ProjectParser();
  159. pp.runExample();
  160. System.out.println("Parsing Complete.\n");
  161.  
  162. try {
  163. iNoRows = psInsertGenre.executeBatch();
  164. }catch (SQLException e) {
  165. e.printStackTrace();
  166. }
  167.  
  168. try {
  169. iNoRows = psInsertPeople.executeBatch();
  170. }catch (SQLException e) {
  171. e.printStackTrace();
  172. }
  173.  
  174. try {
  175. iNoRows = psInsertTitle.executeBatch();
  176. }catch (SQLException e) {
  177. e.printStackTrace();
  178. }
  179.  
  180. try {
  181. iNoRows = psInsertPublisher.executeBatch();
  182. }catch (SQLException e) {
  183. e.printStackTrace();
  184. }
  185.  
  186. try {
  187. iNoRows = psInsertDocument.executeBatch();
  188. }catch (SQLException e) {
  189. e.printStackTrace();
  190. }
  191.  
  192.  
  193. System.out.println("Execute Batch Complete.");
  194.  
  195. try {
  196. if(psInsertGenre!=null) psInsertGenre.close();
  197. if(psInsertPeople!=null) psInsertPeople.close();
  198. if(psInsertTitle!=null) psInsertTitle.close();
  199. if(psInsertPublisher!=null) psInsertPublisher.close();
  200. if(psInsertDocument!=null) psInsertDocument.close();
  201. if(conn!=null) conn.close();
  202.  
  203. }catch (SQLException e) {
  204. e.printStackTrace();
  205. }
  206. }
  207.  
  208.  
  209.  
  210. //Event Handlers
  211. public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
  212. //reset
  213. tempVal = "";
  214.  
  215. if(qName.equalsIgnoreCase("book")) {
  216. //create a new instance of employee
  217. // tempDoc = new Document();
  218. // tempDoc.setmdate(attributes.getValue("mdate"));
  219. // tempDoc.setKey(attributes.getValue("key"));
  220. // tempDoc.setDocType("Book");
  221. currentDoc = "Book";
  222. if (! hashtable.containsValue(currentDoc)) {
  223. hashtable.put(Integer.toString(hashID), currentDoc);
  224. hashID++;
  225. try {
  226. psInsertGenre.setString(1, currentDoc);
  227. psInsertGenre.addBatch();
  228. }catch (Exception e) {
  229. e.printStackTrace();
  230. }
  231. }
  232. }
  233.  
  234. else if (qName.equalsIgnoreCase("incollection")) {
  235. // tempDoc = new Document();
  236. // tempDoc.setmdate(attributes.getValue("mdate"));
  237. // tempDoc.setKey(attributes.getValue("key"));
  238. // tempDoc.setDocType("Incollection");
  239. currentDoc = "Incollection";
  240. if (! hashtable.containsValue(currentDoc)) {
  241. hashtable.put(Integer.toString(hashID), currentDoc);
  242. hashID++;
  243. try {
  244. psInsertGenre.setString(1, currentDoc);
  245. psInsertGenre.addBatch();
  246. }catch (Exception e) {
  247. e.printStackTrace();
  248. }
  249. }
  250. }
  251.  
  252.  
  253. else if (qName.equalsIgnoreCase("proceedings")) {
  254. // tempDoc = new Document();
  255. // tempDoc.setmdate(attributes.getValue("mdate"));
  256. // tempDoc.setKey(attributes.getValue("key"));
  257. // tempDoc.setDocType("Proceedings");
  258. currentDoc = "Proceedings";
  259. if (! hashtable.containsValue(currentDoc)) {
  260. hashtable.put(Integer.toString(hashID), currentDoc);
  261. hashID++;
  262. try {
  263. psInsertGenre.setString(1, currentDoc);
  264. psInsertGenre.addBatch();
  265. }catch (Exception e) {
  266. e.printStackTrace();
  267. }
  268. }
  269.  
  270. }
  271. else if (qName.equalsIgnoreCase("inproceedings")) {
  272. // tempDoc = new Document();
  273. // tempDoc.setmdate(attributes.getValue("mdate"));
  274. // tempDoc.setKey(attributes.getValue("key"));
  275. // tempDoc.setDocType("Inproceedings");
  276. currentDoc = "Inproceedings";
  277. if (! hashtable.containsValue(currentDoc)) {
  278. hashtable.put(Integer.toString(hashID), currentDoc);
  279. hashID++;
  280. try {
  281. psInsertGenre.setString(1, currentDoc);
  282. psInsertGenre.addBatch();
  283. }catch (Exception e) {
  284. e.printStackTrace();
  285. }
  286. }
  287. }
  288. }
  289.  
  290.  
  291.  
  292. public void characters(char[] ch, int start, int length) throws SAXException {
  293. tempVal = new String(ch,start,length);
  294. }
  295.  
  296. public void endElement(String uri, String localName, String qName) throws SAXException {
  297.  
  298.  
  299. if (qName.equalsIgnoreCase(currentDoc)){
  300. //add it to the list
  301. try{
  302. psInsertDocument.setString(1, title);
  303. psInsertDocument.setString(2, startPage);
  304. psInsertDocument.setString(3, endPage);
  305. psInsertDocument.setString(4, year);
  306. psInsertDocument.setString(5, volume);
  307. psInsertDocument.setString(6, number);
  308. psInsertDocument.setString(7, url);
  309. psInsertDocument.setString(8, ee);
  310. psInsertDocument.setString(9, cdrom);
  311. psInsertDocument.setString(10, cite);
  312. psInsertDocument.setString(11, crossref);
  313. psInsertDocument.setString(12, isbn);
  314. psInsertDocument.setString(13, series);
  315. Statement stmt = conn.createStatement();
  316. if (editor.contains('))
  317. editor.replaceFirst("'", "\"");
  318. ResultSet rs = stmt.executeQuery("select id from tbl_people where name = '"+editor+"'");
  319. String editorID = "";
  320. while(rs.next())
  321. editorID = rs.getString("id");
  322. rs.close();
  323. psInsertDocument.setString(14, editorID);
  324.  
  325. if (bookTitle.contains("'"))
  326. bookTitle.replaceFirst("'", "\"");
  327. ResultSet rs2 = stmt.executeQuery("select id from tbl_booktitle where title = '"+bookTitle+"'");
  328. String bookID = "";
  329. while(rs2.next())
  330. bookID = rs2.getString("id");
  331. rs2.close();
  332. psInsertDocument.setString(15, bookID);
  333.  
  334. if (publisher.contains("'"))
  335. publisher.replaceFirst("'", "\"");
  336. ResultSet rs3 = stmt.executeQuery("select id from tbl_publisher where publisher_name = '"+publisher+"'");
  337. String publisherID = "";
  338. while(rs3.next())
  339. publisherID = rs3.getString("id");
  340. rs3.close();
  341. psInsertDocument.setString(16, publisherID);
  342.  
  343. psInsertDocument.addBatch();
  344.  
  345.  
  346. }catch (Exception e) {
  347. e.printStackTrace();
  348. }
  349.  
  350. return;
  351. }
  352.  
  353. if (qName.equalsIgnoreCase("author")) {
  354. String author = tempVal;
  355. if (! hashtable.containsValue(author)) {
  356. hashtable.put(Integer.toString(hashID), author);
  357. hashID++;
  358. try {
  359. psInsertPeople.setString(1, author);
  360. psInsertPeople.addBatch();
  361. }catch (Exception e) {
  362. e.printStackTrace();
  363. }
  364. return;
  365. }}
  366.  
  367. if (qName.equalsIgnoreCase("editor")) {
  368. editor = tempVal;
  369. if (! hashtable.containsValue(editor)) {
  370. hashtable.put(Integer.toString(hashID), editor);
  371. hashID++;
  372. try {
  373. psInsertPeople.setString(1, editor);
  374. psInsertPeople.addBatch();
  375. }catch (Exception e) {
  376. e.printStackTrace();
  377. }
  378. return;
  379. }
  380. }
  381.  
  382. //tempDoc.setPublisher(tempVal);
  383.  
  384. if (qName.equalsIgnoreCase("title")) {
  385. title = tempVal;
  386. //tempDoc.setTitle(tempVal);
  387. }
  388. if (qName.equalsIgnoreCase("year")) {
  389. year = tempVal;
  390. //tempDoc.setYear(tempVal);
  391. }
  392. if (qName.equalsIgnoreCase("booktitle")) {
  393. bookTitle = tempVal;
  394. if (! hashtable.containsValue(bookTitle)) {
  395. hashtable.put(Integer.toString(hashID), bookTitle);
  396. hashID++;
  397. try {
  398. psInsertTitle.setString(1, bookTitle);
  399. psInsertTitle.addBatch();
  400. }catch (Exception e) {
  401. e.printStackTrace();
  402. }
  403. }
  404. return; }
  405.  
  406. if (qName.equalsIgnoreCase("publisher")) {
  407. publisher = tempVal;
  408. if (! hashtable.containsValue(publisher)) {
  409. hashtable.put(Integer.toString(hashID), publisher);
  410. hashID++;
  411. try {
  412. psInsertPublisher.setString(1, publisher);
  413. psInsertPublisher.addBatch();
  414. }catch (Exception e) {
  415. e.printStackTrace();
  416. }
  417. }return; }
  418. if (qName.equalsIgnoreCase("isbn")) {
  419. tempDoc.setISBN(tempVal);
  420. } if (qName.equalsIgnoreCase("url")) {
  421. url = tempVal;
  422. } if (qName.equalsIgnoreCase("pages")) {
  423. StringTokenizer st = new StringTokenizer(tempVal, "-");
  424. while(st.hasMoreTokens()) {
  425. startPage = st.nextToken();
  426. if (st.hasMoreTokens()) {
  427. endPage = st.nextToken();
  428. }
  429. }
  430. tempDoc.setPages(tempVal);
  431. } if (qName.equalsIgnoreCase("address")) {
  432. tempDoc.setAddress(tempVal);
  433. } if (qName.equalsIgnoreCase("journal")) {
  434. tempDoc.setJournal(tempVal);
  435. } if (qName.equalsIgnoreCase("volume")) {
  436. volume = tempVal;
  437. } if (qName.equalsIgnoreCase("number")) {
  438. number = tempVal;
  439. tempDoc.setNumber(tempVal);
  440. } if (qName.equalsIgnoreCase("month")) {
  441. tempDoc.setMonth(tempVal);
  442. } if (qName.equalsIgnoreCase("ee")) {
  443. ee = tempVal;
  444. } if (qName.equalsIgnoreCase("cdrom")) {
  445. cdrom = tempVal;
  446. } if (qName.equalsIgnoreCase("cite")) {
  447. cite = tempVal;
  448. } if (qName.equalsIgnoreCase("note")) {
  449. tempDoc.setNote(tempVal);
  450. } if (qName.equalsIgnoreCase("crossref")) {
  451. crossref = tempVal;
  452. } if (qName.equalsIgnoreCase("series")) {
  453. series = tempVal;
  454. } if (qName.equalsIgnoreCase("school")) {
  455. tempDoc.setSchool(tempVal);
  456. } if (qName.equalsIgnoreCase("chapter")) {
  457. tempDoc.setChapter(tempVal);
  458. }
  459. // }
  460. // else if (qName.equalsIgnoreCase("editor")) {
  461. //// tempDoc.setEditor(tempVal);
  462. // String editor = tempVal;
  463. // if (! hashtable.containsValue(editor)) {
  464. // hashtable.put(Integer.toString(hashID), editor);
  465. // hashID++;
  466. // try {
  467. // psInsertPeople.setString(1, editor+" Editor");
  468. // psInsertPeople.addBatch();
  469. // }catch (Exception e) {
  470. // e.printStackTrace();
  471. // }
  472. // }
  473. // }
  474.  
  475.  
  476. }
  477.  
  478.  
  479. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement