Advertisement
Guest User

Untitled

a guest
Sep 11th, 2018
56
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.69 KB | None | 0 0
  1. package com.trungnt.linkedin;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.DataInputStream;
  5. import java.io.File;
  6. import java.io.FileInputStream;
  7. import java.io.IOException;
  8. import java.io.InputStreamReader;
  9. import java.net.URLEncoder;
  10. import java.util.ArrayList;
  11. import java.util.Arrays;
  12. import java.util.HashSet;
  13. import java.util.LinkedHashSet;
  14. import java.util.List;
  15. import java.util.regex.Matcher;
  16.  
  17. import net.sf.json.JSONArray;
  18. import net.sf.json.JSONObject;
  19. import net.sf.json.JSONSerializer;
  20.  
  21. import org.apache.http.client.HttpClient;
  22. import org.apache.log4j.Logger;
  23.  
  24. import com.trungnt.linkedin.dto.BaseExtractor;
  25. import com.trungnt.linkedin.dto.Contact;
  26. import com.trungnt.linkedin.dto.Experience;
  27. import com.trungnt.util.CSVHelper;
  28. import com.trungnt.util.LinkedInHelper;
  29. import com.trungnt.util.Util;
  30. public class PeopleSearchExtractor extends BaseExtractor{
  31. static Logger logger = Logger.getLogger(PeopleSearchExtractor.class);
  32. String threadID = "";
  33. public static boolean separatedRowForMutualConnection = !Messages.getString("ContactCollector.separatedRowForMutualConnection").startsWith("!");
  34. public static boolean newFile = !Messages.getString("ContactCollector.newFile").startsWith("!");
  35. public static boolean sameFile = !Messages.getString("ContactCollector.sameFile").startsWith("!");
  36. public static boolean excludeLinkedInMembers = !Messages.getString("ContactCollector.excludeLinkedInMembers").startsWith("!");
  37. public static boolean additionalEmails = !Messages.getString("ContactCollector.additionalEmails").startsWith("!");
  38. public static boolean debugMode = !Messages.getString("ContactCollector.debugMode").startsWith("!");
  39. public static boolean fromFile = !Messages.getString("ContactCollector.fromFile").startsWith("!");
  40. public static int toofrAcceptedConfidence = Messages.getString("ContactCollector.toofrAcceptedConfidence").startsWith("!")?0:Integer.parseInt(Messages.getString("ContactCollector.toofrAcceptedConfidence"));
  41. public boolean validToofr = false;
  42. public static boolean multipleSearches = !Messages.getString("ContactCollector.multipleSearches").startsWith("!");
  43. public static void main(String[] args){
  44. Util.updateProgram("https://www.dropbox.com/s/ec7618px99sul14/version.txt?raw=1", "https://www.dropbox.com/s/llsnf5gec2dwi7s/peoplesearch.jar?raw=1", "peoplesearch.jar");
  45. if (fromFile){
  46. new PeopleSearchExtractor().processFromFile(args);
  47. } else {
  48. new PeopleSearchExtractor().process(args);
  49. }
  50. }
  51. public void process(String[] args) {
  52. String username = "";
  53. String password = "";
  54. if (args.length >=2){
  55. username = args[0];
  56. password = args[1];
  57. threadID = args[2];
  58. } else {
  59. username = Messages.getString("ContactCollector.username"); //$NON-NLS-1$
  60. password = Messages.getString("ContactCollector.password"); //$NON-NLS-1$
  61. }
  62. HttpClient client = Util.getHTTPClient();
  63. int planId = skipLicense?Util.PLAN_PLUS:Util.hasValidLicense(username, password, Util.PRODUCT_SEARCH_CONTACT_EXPORT);
  64. if (planId == -1) return;
  65. try {
  66. LinkedInHelper.login(client, username, password);
  67. String output = "";
  68. try{
  69. validToofr = (Util.hasToofrLicense(username)!=-1);
  70. FileInputStream fstream = new FileInputStream("input" + threadID + ".txt");
  71. // Get the object of DataInputStream
  72. DataInputStream in = new DataInputStream(fstream);
  73. BufferedReader br = new BufferedReader(new InputStreamReader(in));
  74. String searchURL;
  75. int count = 1;
  76. // Read File Line By Line
  77. while ((searchURL = br.readLine()) != null) {
  78. if (searchURL.startsWith("#")) continue;
  79. searchURL = searchURL.replaceAll(" ", "%20");
  80. String name = "";
  81. if (searchURL.indexOf("@@@") > -1){
  82. name = searchURL.split("@@@")[0];
  83. searchURL = searchURL.split("@@@")[1];
  84. }
  85. boolean hasNext = true;
  86. int page = 1;
  87. Matcher pageNumMatch = Util.parse(searchURL, "&page[^=]*=(\\d+)");
  88. if (pageNumMatch.find()){
  89. page = Integer.parseInt(pageNumMatch.group(1));
  90. }
  91. String originalSearchURL = searchURL.replaceAll("&page[^=]*=(\\d+)", "");
  92. HashSet<String> ids = new LinkedHashSet<String>();
  93. int fileCount = 1;
  94. if (!sameFile){
  95. if (!newFile){
  96. fileCount = count;
  97. } else {
  98. while (new File("output" + threadID + "-" + fileCount + ".csv").exists()){
  99. fileCount++;
  100. }
  101. }
  102. }
  103. while (hasNext){
  104. if (ids.size() > numberToDownload) break;
  105. System.out.println("Open search page:" + searchURL);
  106. output = Util.getContent(client, searchURL);
  107. if (debugMode) logger.error(output);
  108. JSONArray arr = new JSONArray();
  109. String content = Util.getDataletContent(output, "/voyager/api/search/cluster");
  110. if (content == null){//v2
  111. content = Util.getDataletContent(output, "/voyager/api/search/blended");
  112. JSONObject outter = ((JSONObject) JSONSerializer.toJSON(content));
  113. JSONArray data = outter.getJSONObject("data").getJSONArray("elements");
  114. JSONObject searchCluster = Util.findJSONObjectWithAttribute(data, "type", "SEARCH_HITS");
  115. JSONArray results = Util.getJsonArray(searchCluster, "elements");
  116. for (int i=0; i<results.size(); i++){
  117. try{
  118. JSONObject obj = new JSONObject();
  119. JSONObject personObj = new JSONObject();
  120. personObj.put("link_nprofile_view_1", "https://www.linkedin.com/in/" + URLEncoder.encode(Util.getString(results.getJSONObject(i), "publicIdentifier"), "UTF-8"));
  121. personObj.put("fmt_name", Util.getString(results.getJSONObject(i).getJSONObject("title"), "text"));
  122. personObj.put("fmt_headline", Util.getString(results.getJSONObject(i).getJSONObject("headline"), "text"));
  123. obj.put("person", personObj);
  124. arr.add(obj);
  125. } catch (Exception e){
  126. logger.error(e.getMessage());
  127. }
  128. }
  129.  
  130. } else {
  131. JSONObject outter = ((JSONObject) JSONSerializer.toJSON(content));
  132. JSONArray includes = outter.getJSONArray("included");
  133. JSONObject searchCluster = Util.findJSONObjectWithAttribute(includes, "$type", "com.linkedin.voyager.search.SearchCluster");
  134. JSONArray results = Util.getJsonArray(searchCluster, "elements");
  135. for (int i=0; i<results.size(); i++){
  136. JSONObject hitInfo = Util.findJSONObjectWithAttribute(includes, "$id", results.getString(i)).getJSONObject("hitInfo");
  137. if (hitInfo != null){
  138. JSONObject profileObj = Util.findJSONObjectWithAttribute(includes, "$id", Util.getString(hitInfo, "com.linkedin.voyager.search.SearchProfile"));
  139. String miniProfile = Util.getString(profileObj, "miniProfile");
  140. JSONObject profileObj2 = Util.findJSONObjectWithAttribute(includes, "entityUrn", miniProfile);
  141. if (profileObj2 != null){
  142. JSONObject obj = new JSONObject();
  143. JSONObject personObj = new JSONObject();
  144. personObj.put("link_nprofile_view_1", "https://www.linkedin.com/in/" + URLEncoder.encode(Util.getString(profileObj2, "publicIdentifier"), "UTF-8"));
  145. personObj.put("fmt_name", Util.getString(profileObj2, "firstName") + " " + Util.getString(profileObj2, "lastName"));
  146. personObj.put("fmt_headline", Util.getString(profileObj2, "occupation"));
  147. obj.put("person", personObj);
  148. arr.add(obj);
  149. }
  150. }
  151. }
  152. }
  153. //result in Json
  154. hasNext = false;
  155. searchURL = originalSearchURL + "&page=" + ++page; //http://www.linkedin.com" + search.getJSONObject("baseData").getJSONObject("resultPagination").getJSONObject("nextPage").getString("pageURL");
  156. for (int i=0; i<arr.size(); i++){
  157. try{
  158. if (ids.size() > numberToDownload) break;
  159. if (!arr.getJSONObject(i).has("person")) continue;
  160. hasNext = true;
  161. String url = Util.createURL("https://www.linkedin.com", Util.getProfileURL(arr.getJSONObject(i).getJSONObject("person")));
  162. String id = URLEncoder.encode(url.substring(url.lastIndexOf("/") + 1).split("&")[0]);
  163. if (ids.contains(id)){
  164. continue;
  165. }
  166. if (multipleSearches){
  167. if (new File("done/" + id).exists()){
  168. System.out.println("Downloaded contact with ID=" + id);
  169. continue;
  170. }
  171. }
  172.  
  173. ids.add(id);
  174. Contact contact = createContact(planId);
  175. contact.hasToofr = validToofr;
  176. contact.toofrAcceptedConfidence = toofrAcceptedConfidence;
  177. contact.hasCompanyURL = contact.hasCompanyURL || contact.hasToofr;
  178. contact.inputURL = originalSearchURL;
  179. contact.fullName = Util.getString(arr.getJSONObject(i).getJSONObject("person"), "fmt_name");
  180. if ("".equals(contact.fullName)){
  181. contact.fullName = "LinkedIn Member";
  182. }
  183. if (excludeLinkedInMembers)
  184. if ("LinkedIn Member".equals(contact.fullName)) continue;
  185. if (contact.fullName.indexOf(' ') > -1){
  186. contact.firstName = contact.fullName.substring(0, contact.fullName.indexOf(' '));
  187. contact.lastName = contact.fullName.substring(contact.fullName.indexOf(' ') + 1).trim();
  188. } else {
  189. contact.firstName = contact.fullName;
  190. }
  191. if (planId == Util.PLAN_BASIC){
  192. System.out.println("Extract info for:" + contact.fullName);
  193. }
  194. contact.headline = Util.getString(arr.getJSONObject(i).getJSONObject("person"), "fmt_headline");
  195. contact.industry = Util.getString(arr.getJSONObject(i).getJSONObject("person"), "fmt_industry");
  196. contact.location = Util.getString(arr.getJSONObject(i).getJSONObject("person"), "fmt_location");
  197. contact.distance = Util.getString(arr.getJSONObject(i).getJSONObject("person"), "distance");
  198. contact.linkedInURL = url.split("&")[0];
  199. if (planId >= Util.PLAN_PLUS){
  200. Util.parseContactNew(contact, url , client);
  201. Thread.sleep(delayMin + rand.nextInt(delay - delayMin));
  202. if (multipleSearches){
  203. CSVHelper.writeLine("done/" + id, new String[]{contact.fullName, contact.linkedInURL});
  204. }
  205.  
  206. }
  207. if (! new File("output" + threadID + "-" + fileCount + ".csv").exists()){
  208. String[] out = contact.toCSVHeader();
  209. if (!"".equals(name)){
  210. List<String> wordList = new ArrayList<String>(Arrays.asList(out));
  211. wordList.add(0, "Person");
  212. out = wordList.toArray(new String[0]);
  213.  
  214. }
  215. CSVHelper.writeLine("output" + threadID + "-" + fileCount + ".csv", out);
  216. }
  217. if (separatedRowForMutualConnection){
  218. String[] mutualConnections = contact.mutualText.split("\n");
  219. for (String mutualConnection:mutualConnections){
  220. contact.mutualText = mutualConnection;
  221. CSVHelper.writeLine("output" + threadID + "-" + fileCount + ".csv", contact.toCSVValues());
  222. }
  223. } else {
  224. String[] out = contact.toCSVValues();
  225. if (!"".equals(name)){
  226. List<String> wordList = new ArrayList<String>(Arrays.asList(out));
  227. wordList.add(0, name);
  228. out = wordList.toArray(new String[0]);
  229.  
  230. }
  231. CSVHelper.writeLine("output" + threadID + "-" + fileCount + ".csv", out);
  232. }
  233. if (additionalEmails){
  234. for (Experience experience:contact.experienceArr){
  235. try{
  236. if ("".equals(experience.company.website)) continue;
  237. String domain = experience.company.website.replaceAll("http(s)*://www\\.", "").split("/")[0];
  238. String firstName = contact.firstName.split(",")[0];
  239. String lastName = contact.lastName.split(",")[0];
  240. if (lastName.lastIndexOf(' ') > -1){
  241. lastName = lastName.substring(lastName.lastIndexOf(' ') + 1);
  242. }
  243. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, contact.email, contact.phone});
  244. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, firstName + "." + lastName + "@" + domain, contact.phone});
  245. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, lastName + "." + firstName + "@" + domain, contact.phone});
  246. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, firstName.substring(0, 1) + lastName + "@" + domain, contact.phone});
  247. break;
  248. } catch (Exception e){
  249.  
  250. }
  251. }
  252. }
  253.  
  254. } catch (Exception e){
  255. logger.error(e.getMessage());
  256. }
  257. }
  258. }
  259. count++;
  260. }
  261. in.close();
  262. } catch (Exception e){
  263. logger.error(e.getMessage() + output);
  264. }
  265. } catch (IOException e) {
  266. logger.error(e.getMessage());
  267.  
  268. }
  269.  
  270. }
  271. public void processFromFile(String[] args) {
  272. String username = "";
  273. String password = "";
  274. if (args.length >=2){
  275. username = args[0];
  276. password = args[1];
  277. threadID = args[2];
  278. } else {
  279. username = Messages.getString("ContactCollector.username"); //$NON-NLS-1$
  280. password = Messages.getString("ContactCollector.password"); //$NON-NLS-1$
  281. }
  282. HttpClient client = Util.getHTTPClient();
  283. int planId = Util.hasValidLicense(username, password, Util.PRODUCT_SEARCH_CONTACT_EXPORT);
  284. if ((!skipLicense) && (planId == -1)) return;
  285. try {
  286. LinkedInHelper.login(client, username, password);
  287. try{
  288. FileInputStream fstream = new FileInputStream("input" + threadID + ".txt");
  289. // Get the object of DataInputStream
  290. DataInputStream in = new DataInputStream(fstream);
  291. BufferedReader br = new BufferedReader(new InputStreamReader(in));
  292. String searchURL;
  293. int count = 1;
  294. // Read File Line By Line
  295. while ((searchURL = br.readLine()) != null) {
  296. if (searchURL.startsWith("#")) continue;
  297. int fileCount = 1;
  298. searchURL = searchURL.replaceAll(" ", "%20");
  299. Contact contact = createContact(planId);
  300. Util.parseContactNew(contact, searchURL , client);
  301. Thread.sleep(delayMin + rand.nextInt(delay - delayMin));
  302. if (! new File("output" + threadID + "-" + fileCount + ".csv").exists()){
  303. CSVHelper.writeLine("output" + threadID + "-" + fileCount + ".csv", contact.toCSVHeader());
  304. }
  305. if (separatedRowForMutualConnection){
  306. String[] mutualConnections = contact.mutualText.split("\n");
  307. for (String mutualConnection:mutualConnections){
  308. contact.mutualText = mutualConnection;
  309. CSVHelper.writeLine("output" + threadID + "-" + fileCount + ".csv", contact.toCSVValues());
  310. }
  311. } else {
  312. CSVHelper.writeLine("output" + threadID + "-" + fileCount + ".csv", contact.toCSVValues());
  313. }
  314. if (additionalEmails){
  315. for (Experience experience:contact.experienceArr){
  316. try{
  317. if ("".equals(experience.company.website)) continue;
  318. String domain = experience.company.website.replaceAll("http(s)*://www\\.", "").split("/")[0];
  319. String firstName = contact.firstName.split(",")[0];
  320. String lastName = contact.lastName.split(",")[0];
  321. if (lastName.lastIndexOf(' ') > -1){
  322. lastName = lastName.substring(lastName.lastIndexOf(' ') + 1);
  323. }
  324. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, contact.email, contact.phone});
  325. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, firstName + "." + lastName + "@" + domain, contact.phone});
  326. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, lastName + "." + firstName + "@" + domain, contact.phone});
  327. CSVHelper.writeLine("outputAdditional.csv", new String[]{firstName, lastName, contact.headline, experience.companyName, contact.location, contact.industry, contact.numberOfConnections, firstName.substring(0, 1) + lastName + "@" + domain, contact.phone});
  328. break;
  329. } catch (Exception e){
  330.  
  331. }
  332. }
  333. }
  334. count++;
  335. }
  336. in.close();
  337. } catch (Exception e){
  338. logger.error(e.getMessage());
  339. }
  340. } catch (IOException e) {
  341. logger.error(e.getMessage());
  342.  
  343. }
  344.  
  345. }
  346. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement