Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package kratav.BGC.scrapper;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.OutputStreamWriter;
- import java.io.Writer;
- import java.util.ArrayList;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class MusicCords {
- static int x = 1;
- static String title = null;
- private static String path = "/home/aayush/Development/BollyGuitarCords/Files";
- private static String base_file_name = "/home/aayush/Development/BollyGuitarCords/Files/G";
- private static ArrayList<String> illegal;
- private static String []ignore = {
- "http://tabandchord.com/2014/08/veerey-di-wedding-chords-entertainment/",
- "http://tabandchord.com/2014/08/independence-day-songs-chords-desh-bhakti-song-chords/",
- "http://tabandchord.com/2014/08/independence-day-songs-tab-desh-bhakti-song-tab/",
- "http://tabandchord.com/2014/02/samjho-na-kuchh-toh-samjho-na-tab-aapka-surror/",
- "http://tabandchord.com/2014/02/samjho-na-kuchh-toh-samjho-na-chords-aapka-surror/",
- };
- public static void main(String... args) throws Exception {
- for (int i = 1; i <= 23; i++) {
- Document doc = Jsoup
- .connect(
- "http://tabandchord.com/category/chord/chord-hindi/page/"
- + i + "/").userAgent("Mozilla")
- .timeout(15 * 1000).get();
- Elements e = doc.select(".entry-content");
- Elements links = e.select("a[href]");
- outer: for (Element link : links) {
- if (link.toString().contains("201")) {
- System.out.println(link.attr("href"));
- if(!link.text().contains("–")){
- System.out.println(link.text());
- continue;
- }
- for(String s :ignore){
- if(link.attr("href").equals(s)){
- continue outer;
- }
- }
- if(!writeToFile(x, link.attr("href"),link.text())){
- continue;
- }
- x++;
- }
- }
- }
- System.out.println("==== end of road ====");
- for(String u : illegal){
- System.out.println(u);
- }
- // Parse Data from each page and write that to a file
- }
- public static boolean writeToFile(int x, String url, String title) throws Exception {
- Document doc = Jsoup.connect(url).userAgent("Mozilla")
- .timeout(15 * 1000).get();
- String e = doc.select(".entry-content p").toString();
- System.out.println(e);
- if(e.length()==0){
- return false;
- }
- String g;
- try {
- if (e.toString().contains("</strong>") == true) {
- Elements s = doc.select(".entry-content").select("strong");
- Elements a = s.select("a");
- if(a.size()>0){
- // illegal.add(url);
- illegal = new ArrayList<String>();
- for (Element el : a) {
- Element culprit = el.parent().parent();
- illegal.add(culprit.html());
- }
- }
- System.out.println("\n\n" + s + "\n\n\n");
- String s1[] = s.toString().split("</strong>");
- String K0 = s1[0];
- g = e.toString().replace("<p>" + K0 + "</strong></p>", "");
- try {
- String K1 = s1[1];
- g = g.replace("<p>" + K1.substring(1) + "</strong></p>", "");
- } catch (Exception m) {
- }
- }
- } catch (Exception ex) {
- ex.printStackTrace();
- ex.getMessage();
- System.exit(0);
- }
- e= e.replace("<p>.</p>", "");
- if(illegal != null){
- for (String hate: illegal) {
- e = e.replace("<p>"+hate+"</p>", "");
- }
- }
- if (title.contains("- Tab And Chord"))
- title = title.replace(" - Tab And Chord", "");
- title = title.split("\\(")[0];
- title.trim();
- new File(path).mkdir();
- FileOutputStream fos = new FileOutputStream(base_file_name
- + (x) + " - "
- + title + ".html");
- Writer out = new OutputStreamWriter(fos, "UTF-8");
- System.out.println(e.toString());
- StringBuilder sb = new StringBuilder();
- sb.append(
- "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">")
- .append(e.toString());
- out.write(sb.toString());
- out.close();
- return true;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment