Advertisement
Guest User

Untitled

a guest
Feb 20th, 2017
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.70 KB | None | 0 0
  1.  
  2.  
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.*;

import org.apache.http.Header;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
  19.  
  20. public class MyCrawler extends WebCrawler{
  21.  
  22.  
  23. static int total_count = 0;
  24. static int a_count = 0;
  25. static int b_count = 0;
  26. static int c_count = 0;
  27. static Map<String,Integer> unique_url = new HashMap<String,Integer>();
  28. static Map<String,Integer> in_url = new HashMap<String,Integer>();
  29. static Map<String,Integer> out_url = new HashMap<String,Integer>();
  30.  
  31. private final static Pattern FILTERS=Pattern.compile(".*(\\.(css|js|mid|mp2|mp3|mp4|XML|xml|wav|avi|mov|mpeg|ram|m4v"+"|rm|smil|wmv|swf|wma|zip|rar|gz|php|iso|ico))$");
  32.  
  33. public boolean shouldVisit(Page page, WebURL url) {
  34. String href=url.getURL().toLowerCase();
  35.  
  36. String url_file = "urls_LA.csv";
  37.  
  38. List<String> url_data = new ArrayList<String>();
  39. url_data.add(href.replaceAll(",", "-"));
  40. if(unique_url.containsKey(href.replaceAll(",", "-")))
  41. unique_url.put(href.replaceAll(",", "-"), unique_url.get(href.replaceAll(",", "-"))+1);
  42. else
  43. unique_url.put(href.replaceAll(",", "-"), 1);
  44.  
  45.  
  46. if(href.startsWith("https://www.nytimes.com/")){
  47.  
  48. url_data.add("OK");
  49. if(in_url.containsKey(href.replaceAll(",", "-")))
  50. in_url.put(href.replaceAll(",", "-"), in_url.get(href.replaceAll(",", "-"))+1);
  51. else
  52. in_url.put(href.replaceAll(",", "-"), 1);
  53.  
  54.  
  55. }else{
  56. url_data.add("N_OK");
  57. if(out_url.containsKey(href.replaceAll(",", "-")))
  58. out_url.put(href.replaceAll(",", "-"), out_url.get(href.replaceAll(",", "-"))+1);
  59. else
  60. out_url.put(href.replaceAll(",", "-"), 1);
  61.  
  62. }
  63.  
  64. writeCSV(url_file, url_data);
  65. System.out.println("unique count:: "+ unique_url.size());
  66. System.out.println("in count:: "+ in_url.size());
  67. System.out.println("out count:: "+ out_url.size());
  68. //System.out.println(page.getStatusCode());
  69. return !FILTERS.matcher(href).matches()&&href.startsWith("https://www.nytimes.com/");
  70. }
  71.  
  72. @Override
  73. protected WebURL handleUrlBeforeProcess(WebURL curURL) {
  74. // TODO Auto-generated method stub
  75. String href=curURL.getURL().toLowerCase();
  76. System.out.println("BEFORE PROCESS----> "+href);
  77. return super.handleUrlBeforeProcess(curURL);
  78. }
  79.  
  80.  
  81.  
  82.  
  83. @Override
  84. protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
  85. // TODO Auto-generated method stub
  86. String href=webUrl.getURL().toLowerCase();
  87. String fetch_file = "fetch_LA.csv";
  88. total_count++;
  89. System.out.println("Page Status Error PROCESS----> "+href+"::"+statusCode+ " :: "+total_count);
  90. List<String> fetchData = new ArrayList<String>();
  91. fetchData.add(href.replaceAll(",", "-"));
  92. fetchData.add(String.valueOf(statusCode));
  93. writeCSV(fetch_file,fetchData);
  94. super.handlePageStatusCode(webUrl, statusCode, statusDescription);
  95. }
  96.  
  97. @Override
  98. protected void onUnexpectedStatusCode(String urlStr, int statusCode, String contentType, String description) {
  99. // TODO Auto-generated method stub
  100. System.out.println("Unexpected PROCESS----> "+urlStr+" :: "+statusCode+ " :: "+total_count);
  101. System.out.println("a count:: "+ ++a_count);
  102. List<String> fetchData = new ArrayList<String>();
  103. String fetch_file = "fetch_LA.csv";
  104. fetchData.add(urlStr.replaceAll(",", "-"));
  105. fetchData.add(String.valueOf(statusCode));
  106. writeCSV(fetch_file,fetchData);
  107. super.onUnexpectedStatusCode(urlStr, statusCode, contentType, description);
  108. }
  109.  
  110. @Override
  111. protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
  112. // TODO Auto-generated method stub
  113. List<String> fetchData = new ArrayList<String>();
  114. System.out.println("b count:: "+ ++b_count);
  115. String fetch_file = "fetch_LA.csv";
  116. //fetchData.add(urlStr.replaceAll(",", "-"));
  117. //fetchData.add(String.valueOf(0));
  118. //writeCSV(fetch_file,fetchData);
  119.  
  120. System.out.println("BIGG PROCESS----> "+urlStr);
  121. super.onPageBiggerThanMaxSize(urlStr, pageSize);
  122. }
  123.  
  124. // @Override
  125. /* protected void onContentFetchError(WebURL webUrl) {
  126. // TODO Auto-generated method stub
  127. System.out.println("c count:: "+ ++c_count);
  128. List<String> fetchData = new ArrayList<String>();
  129. String href=webUrl.getURL().toLowerCase();
  130.  
  131. String fetch_file = "fetch_LA.csv";
  132. fetchData.add(href.replaceAll(",", "-"));
  133. fetchData.add(String.valueOf(0));
  134. writeCSV(fetch_file,fetchData);
  135. System.out.println("Fetch Error PROCESS----> "+href);
  136. super.onContentFetchError(webUrl);
  137.  
  138. }*/
  139.  
  140. @Override
  141. protected void onParseError(WebURL webUrl) {
  142. // TODO Auto-generated method stub
  143. String href=webUrl.getURL().toLowerCase();
  144. System.out.println("Parse Error PROCESS----> "+href);
  145. super.onParseError(webUrl);
  146. }
  147.  
  148. @Override
  149. public void visit(Page page) {
  150.  
  151.  
  152. List<String> visitData = new ArrayList<String>();
  153.  
  154. String visit_file = "visit_LA.csv";
  155. String url = page . getWebURL() . getURL();
  156. System . out . println( "URL: " + url );
  157.  
  158. visitData.add(url.replaceAll(",", "-"));
  159. visitData.add(String.valueOf(page.getContentData().length));
  160.  
  161. if (page . getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page . getParseData();
  162. // String text = htmlParseData . getText();
  163. //String html = htmlParseData . getHtml();
  164. Set< WebURL >links = htmlParseData . getOutgoingUrls();
  165. // System.out.println( "Text length: " + text . length());
  166. //System.out.println( "Html length: " + html . length());
  167. //System.out.println( "Number of outgoing links : " + links . size());
  168. //System.out.println("content-length: "+page.getContentData().length);
  169. // System.out.println("HTML: "+html);
  170.  
  171. visitData.add(String.valueOf(links . size()));
  172.  
  173. /*Header[] header = page.getFetchResponseHeaders();
  174.  
  175. for(int i=0; i<header.length;i++)
  176. System.out.println(header[i].getName()+" :: "+header[i].getValue());
  177. */
  178.  
  179.  
  180.  
  181. }
  182. else{
  183. visitData.add(String.valueOf(0));
  184. }
  185. visitData.add(page.getContentType());
  186. writeCSV(visit_file,visitData);
  187. System.out.println("a count:: "+a_count);
  188. System.out.println("b count:: "+ b_count);
  189. System.out.println("c count:: "+ c_count);
  190. System.out.println("total count:: "+ total_count);
  191.  
  192.  
  193. }
  194.  
  195.  
  196. public void writeCSV(String fileName,List<String> data){
  197. FileWriter fileWriter = null;
  198.  
  199. final String COMMA_DELIMITER = ",";
  200. final String NEW_LINE_SEPARATOR = "\n";
  201.  
  202.  
  203.  
  204. try {
  205.  
  206. fileWriter = new FileWriter(fileName,true);
  207.  
  208. //Write a new student object list to the CSV file
  209. for(int i=0;i<data.size();i++){
  210.  
  211. fileWriter.append(data.get(i));
  212.  
  213. if(i!=data.size()-1)
  214. fileWriter.append(COMMA_DELIMITER);
  215. else
  216. fileWriter.append(NEW_LINE_SEPARATOR);
  217.  
  218. }
  219.  
  220.  
  221.  
  222. } catch (Exception e) {
  223.  
  224. System.out.println("Error in CsvFileWriter !!!");
  225.  
  226. e.printStackTrace();
  227.  
  228. } finally {
  229.  
  230.  
  231.  
  232. try {
  233.  
  234. fileWriter.flush();
  235.  
  236. fileWriter.close();
  237.  
  238. } catch (IOException e) {
  239.  
  240. System.out.println("Error while flushing/closing fileWriter !!!");
  241.  
  242. e.printStackTrace();
  243.  
  244. }
  245.  
  246.  
  247. }
  248.  
  249.  
  250.  
  251.  
  252. }
  253.  
  254.  
  255. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement