Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package com.john.itunes;
- import java.io.BufferedReader;
- import java.io.FileNotFoundException;
- import java.io.FileReader;
- import java.io.IOException;
- import java.io.PrintWriter;
- import java.util.ArrayList;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class iTunesScrapper {
- public static void main (String [] args) throws IOException, InterruptedException {
- ArrayList<String> links = new LinkRetriever().getLinks();
- for (String link : links) {
- new iTunesScrapper().parseData(link);
- // add a delay so itunes wont block our ip, not proven though
- Thread.sleep(60000);
- }
- }
- public void parseData(String url) throws FileNotFoundException, InterruptedException {
- // get the url and parse the page
- String category = url.split("games-")[1].split("/")[0].trim();
- Document doc = null;
- Elements eles = null;
- PrintWriter pw = new PrintWriter("C:/Users/MarionMariah/Desktop/"+category+".txt");
- while (true) {
- try {
- for (Integer letterIndex = 0; letterIndex < 27; letterIndex++) {
- doc = Jsoup.connect(url).get();
- Elements letterElems = doc.select(".list.alpha a");
- doc = Jsoup.connect(letterElems.get(letterIndex).attr("href")).get();
- while (true) {
- eles = doc.select("#selectedcontent div");
- for (Element elements : eles) {
- Elements elems = elements.select("ul li");
- for (Element e : elems) {
- String link = e.select("a").attr("href");
- System.out.println("now parsing... " + link);
- // now for every url get the required details
- while (true) {
- try {
- Document docdetails = Jsoup.connect(link).get();
- for (Integer i = 0; i < 5 && docdetails.select(".loadingbox").size() != 0; i++) {
- Thread.sleep(60000);
- docdetails = Jsoup.connect(link).get();
- }
- String image = docdetails.select("#left-stack").get(0).select("img").attr("src-swap");
- String title = docdetails.select("#title .left h1").html().trim();
- String developer = docdetails.select("#title .left h2").html().trim().replace("By", "").trim();
- String description = docdetails.select(".product-review").first().html();
- description = description.replace("<h4> ", "").replace(" </h4>", "").replace("<p>", "").replace("<br />", "\n").replace("</p>", "");
- description = description.replace(""", "\"").replace("&", "&").replace(" ", " ").replace(">", ">");
- ArrayList<String> screenshots = new ArrayList<String>();
- Elements els = docdetails.select(".content.iphone-screen-shots .lockup img");
- for (Element el : els) {
- screenshots.add(el.attr("src"));
- }
- System.out.println("Title: " + title);
- System.out.println("Developer: " + developer);
- System.out.println("<img href=\"" + image + "\"/>");
- System.out.println(description);
- // print the details in the file
- pw.println("Title: " + title);
- pw.println("Developer: " + developer);
- pw.println("<img href=\"" + image + "\"/>");
- pw.println(description);
- // add the hyperlink
- link = link + "&uo=4&at=10lGio";
- pw.println(link);
- // adding images as string instead of image
- String links = "";
- for (String s : screenshots) {
- links = links + "<img href=\"" + s + "\"/>\n";
- }
- System.out.println(links);
- pw.println("\nScreenShots: \n" + links);
- break;
- } catch (IOException | IndexOutOfBoundsException ex) {
- System.out.println("error encountered, continuing..");
- ex.printStackTrace();
- continue;
- }
- }
- System.out.println("\n###############################################################################\n");
- pw.println("\n###############################################################################\n");
- }
- }
- pw.close();
- Elements nextPage = doc.select(".paginate-more");
- if (nextPage.size() != 0) {
- Thread.sleep(60000);
- doc = Jsoup.connect(nextPage.get(0).attr("href")).get();
- continue;
- } else {
- break;
- }
- }
- }
- } catch (IOException e) {
- System.out.println("error encountered, continuing...");
- e.printStackTrace();
- continue;
- }
- }
- }
- }
/**
 * Reads the list of category URLs to scrape from a plain-text file, one URL per line.
 */
class LinkRetriever {

    /** Input file used by {@link #getLinks()}. */
    static final String DEFAULT_LINKS_FILE = "C:/Users/MarionMariah/Desktop/sampleFile.txt";

    /** Reader opened by the most recent call; it is closed again before that call returns. */
    BufferedReader br = null;

    /**
     * Reads the URLs from {@link #DEFAULT_LINKS_FILE}.
     *
     * @return the non-blank lines of the file, trimmed, in file order
     * @throws IOException if the file cannot be opened or read
     */
    public ArrayList<String> getLinks() throws IOException {
        return getLinks(DEFAULT_LINKS_FILE);
    }

    /**
     * Reads the URLs from the given file, echoing each one to stdout.
     *
     * <p>Blank lines are skipped and surrounding whitespace is removed, so a trailing empty
     * line in the file does not produce an unusable entry. The reader is always closed, even
     * when reading fails part-way.
     *
     * @param path path of the text file to read
     * @return the non-blank lines of the file, trimmed, in file order
     * @throws IOException if the file cannot be opened or read
     */
    public ArrayList<String> getLinks(String path) throws IOException {
        ArrayList<String> links = new ArrayList<String>();
        br = new BufferedReader(new FileReader(path));
        try (BufferedReader reader = br) {
            String line;
            while ((line = reader.readLine()) != null) {
                String link = line.trim();
                if (link.isEmpty()) {
                    continue;
                }
                links.add(link);
                System.out.println(link);
            }
        }
        return links;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement