Advertisement
Skelun

F95Zone Thread Scraper v1.41

Nov 4th, 2023
1,140
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /* ====================================================================
  2. ==                      [F95ZONE] THREAD SCRAPER
  3. =======================================================================
  4. -- MODIFIED : 2023-11-04
  5. -- CREATED  : 2023-10-27
  6. -- LICENSE  : MIT license (Just not sell it ...)
  7. -- --------------------------------------------------------------------
  8. -- Scrapes the data with CheerioGS (https://github.com/tani/cheeriogs)
  9. -- To use it, first install the library using the Script ID
  10. -- 1ReeQ6WO8kKNxoaA_O0XEQ589cIrRvEBA9qcWpNqdOP17i47u6N9M5Xh0
  11. ======================================================================= */
  12.  
  13. /* --------------------------------------------------------------------
  14. -- CheerioGS starting function
  15. ----------------------------------------------------------------------- */
  16.  
  /**
   * Downloads a page and hands back its body as text.
   *
   * The request is sent with `muteHttpExceptions`, so the HTTP status is
   * inspected here; anything other than 200 yields an empty string.
   *
   * @param {string} url - Address passed to UrlFetchApp.fetch.
   * @returns {string} The response text on HTTP 200, otherwise ''.
   */
  function getContent(url) {
    const fetchOptions = { muteHttpExceptions: true };
    const page = UrlFetchApp.fetch(url, fetchOptions);

    if (page.getResponseCode() === 200) {
      return page.getContentText();
    }

    return '';
  }
  21.  
  22. /* --------------------------------------------------------------------
  23. -- Actual scrape function
  24. ----------------------------------------------------------------------- */
  25.  
  /**
   * Scrapes the thread page behind every URL in the first column of the
   * active selection and writes the extracted fields into the columns to the
   * right of that URL, in this order:
   * title, version, dev, status, engine, gameplay, tags, rating (moons),
   * votes, release, censored, start.
   *
   * A row is written when `force` is truthy, when the release date is less
   * than 30 days old, or when the "Thread Updated" date is more than 30 days
   * old. If a date cannot be parsed, its day count is NaN and that comparison
   * is false, so such rows are only written under `force`.
   *
   * Rows whose cell holds no valid URL, or whose page could not be fetched,
   * are skipped so existing data is never replaced by placeholders.
   *
   * @param {boolean} [force] - Truthy to bypass the date-based check.
   * @returns {void}
   */
  function scrapeData(force) {

    const MS_PER_DAY = 1000 * 60 * 60 * 24;
    const DAYS_THRESHOLD = 30;

    // Prefix labels looked up inside the thread title.
    const statusLabels = ['Completed', 'Abandoned', 'Onhold'];
    const engineLabels = [
      'ADRIFT', 'Flash', 'Java',
      'Others', 'QSP', 'RAGS',
      'RPGM', "Ren'Py", 'Tads',
      'Unity', 'Unreal Engine', 'WebGL',
      'Wolf RPG', 'HTML'
    ];

    // Tag text -> value written to the [Gameplay] column (first match wins).
    const gameplayOptions = {
      'sandbox': 'Sandbox',
      'turn based combat': 'TBC',
      'management': 'Management',
      'simulator': 'Simulator',
      'rpg': 'RPG'
    };

    // Words that end the developer name capture. 'SubscribeStar' is the
    // correct spelling; the earlier misspelling is kept so nothing that
    // matched before stops matching.
    const devStopWords = [
      'Patreon', 'F95Zone', 'Itch\\.io', 'Discord', 'SubscribeStar', 'SubcribeStar', 'Buy Me A Coffee',
      'Blogspot', 'Website', 'Twitter', 'Pixiv', 'Fanbox', '\\-'
    ];
    const devMatchRegex = new RegExp(`Developer\\s*(?:\\/\\s*Publisher)?\\s*:\\s*((?:(?!${devStopWords.join('|')}).)+)`, 'i');

    const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
    const selectedRange = sheet.getActiveRange();
    const selectedValues = selectedRange.getRichTextValues();
    const firstRow = selectedRange.getRow();

    // Scraped data goes to the columns right after the URL column.
    const outputColumn = selectedRange.getColumn() + 1;
    const today = new Date();

    for (let i = 0; i < selectedValues.length; i++) {

      // The URLs are expected in the first column of the selection.
      // Prefer the embedded link; fall back to the cell text.
      const urlCell = selectedValues[i][0];
      const url = urlCell.getLinkUrl() || urlCell.getText();

      if (!url || !isValidURL(url)) {
        continue;
      }

      // getContent returns '' for any non-200 response; parsing that would
      // only produce placeholders, so the row is left alone.
      const content = getContent(url);
      if (!content) {
        Logger.log("Could not fetch " + url + " - row skipped.");
        continue;
      }

      const $ = Cheerio.load(content);

      /* --------------------------------------------------------------------
      -- [Title] [Version] and the developer taken from the title
      -------------------------------------------------------------------- */
      const threadTitle = $('h1.p-title-value');

      // Only the direct text nodes: the label spans are handled separately.
      const rawTitle = threadTitle.contents().filter(function() {
        return this.type === 'text';
      }).text();

      // Expected shape: "Title [Version] [Developer]" (developer optional).
      const titleParts = /(.+?)\s*\[(.+?)\](?:\s*\[(.+?)\])?/.exec(rawTitle);

      // If there's no regex match, assume rawTitle is the title.
      const title = titleParts ? (titleParts[1] || "Title N.F. ⚠️") : rawTitle;
      const version = titleParts && titleParts[2] ? titleParts[2] : "Version N.F. ⚠️";
      const devBackup = titleParts && titleParts[3] ? titleParts[3] + " ❓" : "Dev N.F. ⚠️";

      /* --------------------------------------------------------------------
      -- [Status] -- defaults to 'Active' when no status label is present
      -------------------------------------------------------------------- */
      const statusSpan = threadTitle.find('span').filter(function() {
        return statusLabels.includes($(this).text());
      }).first();

      const status = statusSpan.length ? statusSpan.text() : 'Active';

      /* --------------------------------------------------------------------
      -- [Engine] -- defaults to an "unknown" marker
      -------------------------------------------------------------------- */
      const engineSpan = threadTitle.find('span').filter(function() {
        return engineLabels.includes($(this).text());
      }).first();

      const engine = engineSpan.length ? engineSpan.text() : '⚠️ Unknown';

      /* --------------------------------------------------------------------
      -- [Tags] -- comma separated
      -------------------------------------------------------------------- */
      const tags = $('span.js-tagList').find('a.tagItem').map(function() {
        return $(this).text();
      }).get().join(', ');

      /* --------------------------------------------------------------------
      -- [Gameplay] -- first option found in the tag text, else Visual Novel
      -------------------------------------------------------------------- */
      const gameplayKey = Object.keys(gameplayOptions).find((option) => tags.includes(option));
      const gameplay = gameplayOptions[gameplayKey] || 'Visual Novel';

      /* --------------------------------------------------------------------
      -- Thread [Start] Date -- date part of the first post's timestamp
      -------------------------------------------------------------------- */
      const timeElement = $('article').first().find('header.message-attribution').find('time.u-dt');

      // attr() yields undefined when the element or its attribute is
      // missing, so both cases end up as null instead of throwing.
      const startStamp = timeElement.attr('datetime');
      let start = null;
      if (startStamp) {
        start = startStamp.split('T')[0];
      } else {
        Logger.log("<time> element not found.");
      }

      /* --------------------------------------------------------------------
      -- [Updated] [Release] [Dev] [Censored]
      -------------------------------------------------------------------- */
      // The first message body is the one containing the game info.
      const articleText = $('article.message-body').first().text();

      const updated = /Thread Updated:\s*([^\s]+)/.exec(articleText)?.[1] || "Not found ⚠️";
      const release = /(Release|Update) Date:\s*([^\s]+)/.exec(articleText)?.[2] || "Not found ⚠️";

      const devMatch = devMatchRegex.exec(articleText)?.[1]?.trim();

      // True when the post names a developer that is not "You" and differs
      // from the one in the title (compared without the warning signs).
      const isDifferentFromBackup = devMatch && devMatch !== "You" && devMatch !== devBackup.replace(" ❓", "").replace(" ⚠️", "").trim();

      // A developer that disagrees with the title gets a warning sign.
      const dev = isDifferentFromBackup ? devMatch + " ⚠️" : devMatch || devBackup;

      Logger.log("Dev: " + dev);
      Logger.log("Backup: " + devBackup);

      const censored = /Censored:\s*(.+?)(?=\n|\r|\r\n)/.exec(articleText)?.[1] || "Prob. Not";

      /* --------------------------------------------------------------------
      -- [Rating] & [Votes] -- read from the page's last ld+json script
      -------------------------------------------------------------------- */
      // NOTE(review): relies on the string form of the Cheerio selection
      // containing the script's JSON text - confirm against CheerioGS.
      const jsonInfo = String($('script[type="application/ld+json"]').last());

      const rating = /"ratingValue": "(\d+(?:\.\d+)?)"/.exec(jsonInfo)?.[1] || "No rating yet";
      const votes = /"ratingCount": "(\d+)"/.exec(jsonInfo)?.[1] || "0";

      /* --------------------------------------------------------------------
      -- Date check
      -------------------------------------------------------------------- */
      // NOTE(review): depends on Date being able to parse the forum's date
      // text; an unparsable date gives NaN and fails both comparisons.
      const daysSinceLastUpdate = Math.floor((today - new Date(updated)) / MS_PER_DAY);
      const daysSinceRelease = Math.floor((today - new Date(release)) / MS_PER_DAY);

      if (!(force || daysSinceRelease < DAYS_THRESHOLD || daysSinceLastUpdate > DAYS_THRESHOLD)) {
        continue;
      }

      /* --------------------------------------------------------------------
      -- Write the data into the row of the URL
      -------------------------------------------------------------------- */
      const outputRow = firstRow + i;

      // Ordered [name, value] pairs; the order is the column order. Notes
      // are positioned by name so a value that happens to equal another
      // field can never send a note to the wrong cell.
      const columns = [
        ['title', title],
        ['version', version],
        ['dev', dev],
        ['status', status],
        ['engine', engine],
        ['gameplay', gameplay],
        ['tags', tags],
        ['rating', moonRating(rating)],
        ['votes', votes],
        ['release', release],
        ['censored', censored],
        ['start', start]
      ];
      const offsetOf = (name) => columns.findIndex(([key]) => key === name);

      // Note on the [Tags] cell holding the full tag list.
      sheet.getRange(outputRow, outputColumn + offsetOf('tags')).setNote(tags);

      // Note on the moon-rating cell holding the exact rating.
      sheet.getRange(outputRow, outputColumn + offsetOf('rating')).setNote('Rating: ' + rating);

      // Empty or missing new values keep whatever the cell already holds.
      const rangeToWrite = sheet.getRange(outputRow, outputColumn, 1, columns.length);
      const existingValues = rangeToWrite.getValues()[0];
      const mergedValues = columns.map(([, value], j) =>
        (value === "" || value == null) ? existingValues[j] : value
      );

      rangeToWrite.setValues([mergedValues]);
    }

  }
  257.  
  258. /* --------------------------------------------------------------------
  259. -- Ignore all restrictions when scraping
  260. ----------------------------------------------------------------------- */
  261.  
  /**
   * Runs scrapeData with its `force` argument set to true; registered under
   * the 'Force Scrape' menu item.
   *
   * @returns {*} Whatever scrapeData returns.
   */
  function forceScrapeData() {
    return scrapeData(true);
  }
  263.  
  264. /* --------------------------------------------------------------------
  265. -- Transforms embedded links into actual Hyperlinks
  266. ----------------------------------------------------------------------- */
  267.  
  /**
   * Turns every cell of the active selection that carries an embedded link
   * into a `=HYPERLINK("<url>", "Here")` formula.
   *
   * Cells without a link are left untouched: writing the whole selection in
   * bulk would put an empty formula into them and wipe their content.
   * Double quotes inside the URL are doubled so they cannot terminate the
   * formula's string literal.
   *
   * @returns {void}
   */
  function createHyperlink() {
    const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
    const range = sheet.getActiveRange();
    const richTextRows = range.getRichTextValues();

    richTextRows.forEach((row, rowIndex) => {
      row.forEach((richText, columnIndex) => {
        const link = richText.getLinkUrl();
        if (!link) {
          return;
        }

        const escapedLink = link.replace(/"/g, '""');

        // getCell takes 1-based coordinates relative to the range.
        range
          .getCell(rowIndex + 1, columnIndex + 1)
          .setFormula(`=HYPERLINK("${escapedLink}", "Here")`);
      });
    });
  }
  281.  
  282. /* ====================================================================
  283. == HELPER FUNCTIONS
  284. ======================================================================= */
  285.  
  286. /* --------------------------------------------------------------------
  287. -- MoonRating -- Simulates rating stars with emojis.
  288. ----------------------------------------------------------------------- */
  289.  
  /**
   * Renders a rating as five moon emojis: one full moon per whole point, a
   * half moon for any fractional part, and new moons for the remainder.
   *
   * The value is clamped to the 0-5 range first, so an out-of-range rating
   * can no longer hand a negative or infinite count to String.prototype.repeat
   * (which throws a RangeError).
   *
   * @param {number|string} number - The rating; numeric strings are accepted.
   * @returns {string} Five moons, or "------" when the value is not numeric.
   */
  function moonRating(number) {
    const MAX_MOONS = 5;
    const fullMoon = "🌕";
    const halfMoon = "🌗";
    const newMoon = "🌑";

    const parsed = Number(number);
    if (Number.isNaN(parsed)) return "------";

    const score = Math.min(Math.max(parsed, 0), MAX_MOONS);
    const fullCount = Math.floor(score);
    const hasHalfMoon = score % 1 !== 0;
    const emptyCount = MAX_MOONS - fullCount - (hasHalfMoon ? 1 : 0);

    return fullMoon.repeat(fullCount) +
           (hasHalfMoon ? halfMoon : "") +
           newMoon.repeat(emptyCount);
  }
  301.  
  302. /* --------------------------------------------------------------------
  303. -- Check if the string is a valid URL
  304. ----------------------------------------------------------------------- */
  /**
   * Tests whether a string looks like a URL: optional http(s) protocol, a
   * domain name or IPv4-style address, then optional port, path, query
   * string and fragment.
   *
   * A domain label is written as `x(y*x)?` rather than `x(y*x)*`. Both accept
   * the same strings, but the starred form backtracks exponentially on a
   * long word without dots and can hang the script on ordinary cell text.
   *
   * @param {string} str - Text to test.
   * @returns {boolean} True when the whole string matches the URL pattern.
   */
  function isValidURL(str) {
    const pattern = new RegExp(
      '^(https?:\\/\\/)?' + // protocol
      '((([a-z\\d]([a-z\\d-]*[a-z\\d])?)\\.)+[a-z]{2,}|' + // domain name
      '((\\d{1,3}\\.){3}\\d{1,3}))' + // OR an IP Address
      '(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*' + // port and path
      '(\\?[;&a-z\\d%_.~+=-]*)?' + // query string
      '(\\#[-a-z\\d_]*)?$', // fragment locator
      'i'
    );
    return pattern.test(str);
  }
  314.  
  315. /* ====================================================================
  316. == UI FUNCTIONS
  317. ======================================================================= */
  318.  
  319. /* --------------------------------------------------------------------
  320. -- Creates extra options on the menu
  321. ----------------------------------------------------------------------- */
  322.  
  /**
   * Adds the 'Extra Functions' menu to the spreadsheet UI, with one item per
   * script function: scrapeData, forceScrapeData and createHyperlink.
   *
   * NOTE(review): presumably invoked by Apps Script's onOpen simple trigger -
   * nothing in this file calls it directly.
   *
   * @returns {void}
   */
  function onOpen() {
    // [caption shown in the menu, name of the function to run]
    const menuEntries = [
      ['Scrape Data', 'scrapeData'],
      ['Force Scrape', 'forceScrapeData'],
      ['Create Hyperlink', 'createHyperlink'],
    ];

    const emptyMenu = SpreadsheetApp.getUi().createMenu('Extra Functions');
    const filledMenu = menuEntries.reduce(
      (menu, [caption, functionName]) => menu.addItem(caption, functionName),
      emptyMenu
    );

    filledMenu.addToUi();
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement