Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- The MIT License (MIT)
- Copyright (c) 2015 sinfonier-project
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
- package com.sinfonier.bolts;
- import org.jsoup.*;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class DigitalSpyUserParse extends BaseSinfonierBolt {
- //TO-DO: Declare variables
- private String htmlfield;
- public DigitalSpyUserParse(String path) {
- super(path);
- }
- @Override
- public void userprepare() {
- // TO-DO: Init values. Code here runs once
- // Get Param (get value of "param_name" from input box)
- this.htmlfield = (String)this.getParam("htmlfield");
- }
- @Override
- public void userexecute() {
- // TO-DO: Write your code here. This code runs once by each input tuple
- // You can use the following functions to process it
- String htmlstring = (String)this.getField("html");
- Document html = Jsoup.parse(htmlstring);
- Elements tds = html.select("td");
- Elements dts = html.select("dt");
- Elements dds = html.select("dd");
- Elements fieldset = html.select("fieldset");
- String username = html.select("div[class=bigusername]").text();
- String joinDate = html.select("div[style=padding:3px]").select("span[class=boldlink]").text();
- String lastConnection = "";
- String location = "";
- int totalPosts = 0;
- int age = 0;
- String bdate = "";
- String occupation = "";
- String services = "";
- String biography = "";
- String gender = "";
- String interests = "";
- String homePage = "";
- String icq = "";
- String msn = "";
- for(Element elemento: tds) {
- if(elemento.text().contains("Last Activity:")) {
- lastConnection = elemento.text().replace("Last Activity: ", "");
- }
- if(elemento.text().contains("Total Posts:") && !elemento.text().contains("Join Date")) {
- String aux = elemento.text().replace("Total Posts: ", "").split(" ")[0];
- if(aux.contains(",")) {
- aux = aux.replace(",", "");
- }
- totalPosts = Integer.parseInt(aux);
- }
- if(elemento.text().contains("Home Page: ")) {
- homePage = elemento.text().replace("Home Page: ", "");
- }
- }
- int i = 0;
- for(Element elemento: dds) {
- if(dts.get(i).text().equals("Age:"))
- {
- age = Integer.parseInt(dds.get(i).text());
- }
- if(dts.get(i).text().equals("Location:"))
- {
- location = dds.get(i).text();
- }
- if(dts.get(i).text().equals("Services:"))
- {
- services = dds.get(i).text();
- }
- if(dts.get(i).text().equals("Biography:"))
- {
- biography = dds.get(i).text();
- }
- if(dts.get(i).text().equals("Gender:"))
- {
- gender = dds.get(i).text();
- }
- if(dts.get(i).text().equals("Interests:"))
- {
- interests = dds.get(i).text();
- }
- if(dts.get(i).text().equals("Occupation:"))
- {
- occupation = dds.get(i).text();
- }
- if(dts.get(i).text().equals("Date of Birth:"))
- {
- bdate = dds.get(i).text();
- }
- i++;
- }
- for(Element elemento: fieldset)
- {
- if(elemento.text().contains("Instant Messaging"))
- {
- if(elemento.text().contains("ICQ"))
- {
- icq = elemento.text().replace("Instant Messaging ICQ ", "");
- }
- if(elemento.text().contains("MSN"))
- {
- msn = elemento.text().replace("Instant Messaging MSN ", "");
- }
- }
- }
- this.addField("source", "DigitalSpy.co.uk");
- this.addField("username", username);
- this.addField("mail", msn);
- this.addField("website", homePage);
- this.addField("location", location);
- this.addField("age", age);
- this.addField("join_date", joinDate);
- this.addField("last_connection", lastConnection);
- this.addField("visits", "");
- this.addField("reputation", "");
- this.addField("bio", biography);
- this.addField("occupation", occupation);
- this.addField("gender", gender);
- this.addField("total_posts", totalPosts);
- this.addField("services", services);
- this.addField("date_of_birth", bdate);
- this.addField("referals", "");
- this.addField("ICQ_number", icq);
- this.addField("msn_messenger", msn);
- this.addField("interests", interests);
- // Mandatory. Emit the tuple to the next bolt
- this.emit();
- }
- public void usercleanup() {
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement