ashutiwari4

Untitled

Mar 31st, 2015
395
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.92 KB | None | 0 0
  1. package kratav.BGC.scrapper;
  2.  
  3. import java.io.File;
  4. import java.io.FileOutputStream;
  5. import java.io.OutputStreamWriter;
  6. import java.io.Writer;
  7. import java.util.ArrayList;
  8.  
  9. import org.jsoup.Jsoup;
  10. import org.jsoup.nodes.Document;
  11. import org.jsoup.nodes.Element;
  12. import org.jsoup.select.Elements;
  13.  
  14. public class MusicCords {
  15. static int x = 1;
  16. static String title = null;
  17. private static String path = "/home/aayush/Development/BollyGuitarCords/Files";
  18. private static String base_file_name = "/home/aayush/Development/BollyGuitarCords/Files/G";
  19. private static ArrayList<String> illegal;
  20. private static String []ignore = {
  21. "http://tabandchord.com/2014/08/veerey-di-wedding-chords-entertainment/",
  22. "http://tabandchord.com/2014/08/independence-day-songs-chords-desh-bhakti-song-chords/",
  23. "http://tabandchord.com/2014/08/independence-day-songs-tab-desh-bhakti-song-tab/",
  24. "http://tabandchord.com/2014/02/samjho-na-kuchh-toh-samjho-na-tab-aapka-surror/",
  25. "http://tabandchord.com/2014/02/samjho-na-kuchh-toh-samjho-na-chords-aapka-surror/",
  26. };
  27. public static void main(String... args) throws Exception {
  28.  
  29. for (int i = 1; i <= 23; i++) {
  30. Document doc = Jsoup
  31. .connect(
  32. "http://tabandchord.com/category/chord/chord-hindi/page/"
  33. + i + "/").userAgent("Mozilla")
  34. .timeout(15 * 1000).get();
  35. Elements e = doc.select(".entry-content");
  36. Elements links = e.select("a[href]");
  37.  
  38.  
  39. outer: for (Element link : links) {
  40. if (link.toString().contains("201")) {
  41. System.out.println(link.attr("href"));
  42.  
  43. if(!link.text().contains("–")){
  44. System.out.println(link.text());
  45. continue;
  46. }
  47. for(String s :ignore){
  48. if(link.attr("href").equals(s)){
  49. continue outer;
  50. }
  51. }
  52. if(!writeToFile(x, link.attr("href"),link.text())){
  53. continue;
  54. }
  55. x++;
  56. }
  57. }
  58. }
  59.  
  60. System.out.println("==== end of road ====");
  61. for(String u : illegal){
  62. System.out.println(u);
  63. }
  64. // Parse Data from each page and write that to a file
  65.  
  66. }
  67.  
  68. public static boolean writeToFile(int x, String url, String title) throws Exception {
  69.  
  70. Document doc = Jsoup.connect(url).userAgent("Mozilla")
  71. .timeout(15 * 1000).get();
  72. String e = doc.select(".entry-content p").toString();
  73.  
  74. System.out.println(e);
  75. if(e.length()==0){
  76. return false;
  77. }
  78. String g;
  79. try {
  80. if (e.toString().contains("</strong>") == true) {
  81. Elements s = doc.select(".entry-content").select("strong");
  82.  
  83. Elements a = s.select("a");
  84.  
  85. if(a.size()>0){
  86. // illegal.add(url);
  87. illegal = new ArrayList<String>();
  88. for (Element el : a) {
  89. Element culprit = el.parent().parent();
  90. illegal.add(culprit.html());
  91. }
  92.  
  93. }
  94.  
  95. System.out.println("\n\n" + s + "\n\n\n");
  96.  
  97. String s1[] = s.toString().split("</strong>");
  98. String K0 = s1[0];
  99. g = e.toString().replace("<p>" + K0 + "</strong></p>", "");
  100. try {
  101. String K1 = s1[1];
  102. g = g.replace("<p>" + K1.substring(1) + "</strong></p>", "");
  103. } catch (Exception m) {
  104. }
  105. }
  106. } catch (Exception ex) {
  107. ex.printStackTrace();
  108. ex.getMessage();
  109. System.exit(0);
  110. }
  111. e= e.replace("<p>.</p>", "");
  112. if(illegal != null){
  113. for (String hate: illegal) {
  114. e = e.replace("<p>"+hate+"</p>", "");
  115. }
  116.  
  117. }
  118. if (title.contains("- Tab And Chord"))
  119. title = title.replace(" - Tab And Chord", "");
  120.  
  121. title = title.split("\\(")[0];
  122. title.trim();
  123.  
  124. new File(path).mkdir();
  125. FileOutputStream fos = new FileOutputStream(base_file_name
  126. + (x) + " - "
  127. + title + ".html");
  128. Writer out = new OutputStreamWriter(fos, "UTF-8");
  129. System.out.println(e.toString());
  130.  
  131. StringBuilder sb = new StringBuilder();
  132. sb.append(
  133. "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">")
  134. .append(e.toString());
  135.  
  136. out.write(sb.toString());
  137. out.close();
  138. return true;
  139.  
  140. }
  141. }
Advertisement
Add Comment
Please, Sign In to add comment