Advertisement
Skelun

Koikatsu List Scraper V1.21

Nov 2nd, 2023
986
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /* --------------------------------------------------------------------
  2. -- Transforms embedded links into actual Hyperlinks
  3. ----------------------------------------------------------------------- */
  4.  
  5. function createHyperlink() {
  6.   var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  7.  
  8.   // Get the selected range
  9.   var range = sheet.getActiveRange();
  10.   var richTextValues = range.getRichTextValues();
  11.  
  12.   for (var i = 0; i < richTextValues.length; i++) {
  13.     for (var j = 0; j < richTextValues[0].length; j++) {
  14.       var textValue = richTextValues[i][j].getText();
  15.       var link = richTextValues[i][j].getLinkUrl();
  16.  
  17.       if (link) {
  18.         // Creates a hyperlink formula
  19.         var formula = '=HYPERLINK("' + link + '", "Here")';
  20.  
  21.         // Set the formula in the cell
  22.         sheet.getRange(range.getRow() + i, range.getColumn() + j).setFormula(formula);
  23.       }
  24.     }
  25.   }
  26. }
  27.  
  28. /* --------------------------------------------------------------------
  29. -- MOON RATING
  30. -----------------------------------------------------------------------
  31. -- I'm using this because it was the easier way
  32. -- to simulate rating stars.
  33. ----------------------------------------------------------------------- */
  34.  
  35. function moonRating(number) {
  36.   // Is it a number?
  37.   if (!isNaN(number)) {
  38.     var votes;
  39.  
  40.     // Is it an interger?
  41.     if (number % 1 === 0) {
  42.       votes="🌕".repeat(number);
  43.       votes+="🌑".repeat(5-number);
  44.    
  45.     // Or is it a float?
  46.     } else {
  47.       let n = Math.floor(number);
  48.       votes="🌕".repeat(n);
  49.       votes+="🌗";
  50.       votes+="🌑".repeat(4-n);
  51.     }
  52.     return votes;
  53.   } else {
  54.     // If no votes
  55.     return "------"
  56.   }
  57. }
  58.  
  59. /* --------------------------------------------------------------------
  60. -- Scraping data using Cheerio library
  61. -- (https://github.com/tani/cheeriogs)
  62. -- Script ID: 1ReeQ6WO8kKNxoaA_O0XEQ589cIrRvEBA9qcWpNqdOP17i47u6N9M5Xh0
  63. ----------------------------------------------------------------------- */
  64.  
  65. // Cheerio method to scrape
  66. function getContent(url) {
  67.     return UrlFetchApp.fetch(url).getContentText()
  68. }
  69.  
  70. /* --------------------------------------------------------------------
  71. -- Actual scrape function
  72. ----------------------------------------------------------------------- */
  73.  
  74. function scrapeData(force) {
  75.  
  76.   // Selects the active sheet
  77.   var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  78.  
  79.   // Get selected cells
  80.   var selectedRange = sheet.getActiveRange();
  81.  
  82.   // Get the values from the selected cells
  83.   var selectedValues = selectedRange.getRichTextValues();
  84.  
  85.   // Iterate through cells
  86.   for (var i = 0; i < selectedValues.length; i++) {
  87.  
  88.     // It assumes the URLs are in the first column of the selected column
  89.     var urlFromLink = selectedValues[i][0].getLinkUrl();
  90.     var urlFromCellValue = selectedValues[i][0].getText(); // Get text value from the cell
  91.  
  92.     // Check if there is a link URL, and use it if available; otherwise, use the URL from the cell value
  93.     var url = urlFromLink ? urlFromLink : urlFromCellValue;
  94.  
  95.     // If the value is not empty...
  96.     if (url && isValidURL(url)) {
  97.  
  98.       // Grabs the HTML data
  99.       var content = getContent(url);
  100.      
  101.       // Parses the HTML data
  102.       var $ = Cheerio.load(content);
  103.  
  104.       /* --------------------------------------------------------------------
  105.       -- [TITLE]
  106.       -------------------------------------------------------------------- */
  107.       var threadTitle = $('h1.p-title-value');
  108.  
  109.       var rawTitle = threadTitle.contents().filter(function() {
  110.         return this.type === 'text';
  111.       }).text();
  112.  
  113.       var titleREGEX = /(.+?)\s*\[(.+?)\](?:\s*\[(.+?)\])?/.exec(rawTitle);
  114.  
  115.       // We should check if titleREGEX is not null before accessing its indices
  116.       if (titleREGEX) {
  117.         var title = titleREGEX[1];
  118.         var version = titleREGEX[2];
  119.       } else {
  120.         var title = "Title Problem ⚠️";
  121.         var version = "Version Problem ⚠️";        
  122.       }
  123.  
  124.       var devBackup = titleREGEX[3] ? titleREGEX[3] + " ❓" : "Not found ⚠️";
  125.  
  126.       /* --------------------------------------------------------------------
  127.       -- [STATUS] (ACTIVE - ABANDONED - ONHOLD)
  128.       -------------------------------------------------------------------- */
  129.       // Default status
  130.       var status = 'Active';
  131.  
  132.       // Search for "Abandoned" or "Onhold"
  133.       threadTitle.find('span').each(function() {
  134.         var spanText = $(this).text();
  135.         if (['Completed','Abandoned','Onhold'].includes(spanText)) {
  136.           status = spanText;
  137.           return false;
  138.         }
  139.       });
  140.  
  141.       /* --------------------------------------------------------------------
  142.       -- [ENGINE]
  143.       ----------------------------------------------------------------------- */
  144.       // If there's no engine info, it returns "unknown"
  145.       var engine = '⚠️ Unknown';
  146.  
  147.       // Checks if there's one of these strings
  148.       threadTitle.find('span').each(function() {
  149.         var spanText = $(this).text();
  150.         if (['ADRIFT','Flash','Java','Others','QSP','RAGS','RPGM',"Ren'Py",'Tads','Unity','Unreal Engine','WebGL','Wolf RPG','HTML'].includes(spanText)) {
  151.           engine = spanText;
  152.           return false; // Stop searching once a match is found
  153.         }
  154.       });
  155.  
  156.       /* --------------------------------------------------------------------
  157.       -- [TAGS]
  158.       ----------------------------------------------------------------------- */
  159.       // Selects the element containing the tags
  160.       var tagList = $('span.js-tagList');
  161.  
  162.       // Grabs the tags and separate them by comma
  163.       var tags = tagList.find('a.tagItem').map(function() {
  164.         return $(this).text();
  165.       }).get().join(', ');
  166.  
  167.       /* --------------------------------------------------------------------
  168.       -- GAMEPLAY
  169.       ----------------------------------------------------------------------- */
  170.       var gameplay;
  171.  
  172.       // Assumes the [Gameplay] type by using the tags
  173.       switch (true) {
  174.         case tags.includes('sandbox'):
  175.           gameplay = 'Sandbox';
  176.           break;
  177.         case tags.includes('turn based combat'):
  178.           gameplay = 'TBC';
  179.           break;
  180.         case tags.includes('management'):
  181.           gameplay = 'Management';
  182.           break;
  183.         case tags.includes('simulator'):
  184.           gameplay = 'Simulator';
  185.           break;
  186.         case tags.includes('rpg'):
  187.           gameplay = 'RPG';
  188.           break;
  189.         default:
  190.           gameplay = 'Visual Novel';
  191.       }
  192.      
  193.       /* --------------------------------------------------------------------
  194.       -- - Thread [Start] Date
  195.       ----------------------------------------------------------------------- */
  196.  
  197.       // Selects the first 'article' element
  198.       var firstArticle = $('article').first();
  199.  
  200.       // Find the 'header.message-attribution' element
  201.       var messageAttributionHeader = firstArticle.find('header.message-attribution');
  202.  
  203.       // Find the 'time' element with class 'u-dt'
  204.       var timeElement = messageAttributionHeader.find('time.u-dt');
  205.  
  206.       // Check if the time element exists
  207.       if (timeElement.length > 0) {
  208.           // Extract the 'datetime' attribute
  209.           var start = timeElement.attr('datetime');
  210.           start = start.split('T')[0]; // Get the date part only
  211.  
  212.       } else {
  213.           Logger.log("<time> element not found.");
  214.       }
  215.  
  216.       /* --------------------------------------------------------------------
  217.       -- EXTRA INFORMATION
  218.       -----------------------------------------------------------------------
  219.       -- - Thread [Updated] Date
  220.       -- - [Release] Date
  221.       -- - [Dev] name
  222.       -- - [Censored]
  223.       ----------------------------------------------------------------------- */
  224.       // Selects the first message body (which is the one containing the game info)
  225.       var articleText = $('article.message-body').first().text();
  226.  
  227.       // Uses regular expressions to match and capture the information
  228.       // When it was updated
  229.       var updatedREGEX = /Thread Updated: ([^\s]+)/.exec(articleText);
  230.       // When it was released
  231.       var releaseREGEX = /Release Date: ([^\s]+)/.exec(articleText);
  232.       // Who is the dev
  233.       var devREGEX = /Developer: ([^\s]+)/.exec(articleText);
  234.       // It is censored?
  235.       var censoredREGEX = /Censored: (.+?)(?=\n|\r|\r\n)/.exec(articleText);
  236.  
  237.       // Extract the captured values or provide default values if not found
  238.       var updated = updatedREGEX ? updatedREGEX[1] : "Not found ⚠️";
  239.       var release = releaseREGEX ? releaseREGEX[1] : "Not found ⚠️";
  240.      
  241.       //Converts to string due to Google Sheets altering the type
  242.       updated = updated.toString()
  243.       release = release.toString()
  244.  
  245.       var dev = devREGEX ? devREGEX[1] : devBackup;
  246.       var censored = censoredREGEX ? censoredREGEX[1] : "Not found ⚠️";
  247.  
  248.       /* --------------------------------------------------------------------
  249.       -- [RATING] & [VOTES]
  250.       ----------------------------------------------------------------------- */
  251.       // Grabs the JSON generated by F95 forum
  252.       var jsonInfo = $('script[type="application/ld+json"]').last();
  253.      
  254.       // Uses regular expression to match and caputre the information
  255.       var ratingREGEX = /"ratingValue": "(\d+(?:\.\d+)?)"/.exec(jsonInfo);
  256.       var votesREGEX = /"ratingCount": "(\d+)"/.exec(jsonInfo);
  257.  
  258.       // Extract the captured values or provide default values if not found
  259.       var rating = ratingREGEX ? ratingREGEX[1] : "No rating yet";
  260.       var votes = votesREGEX ? votesREGEX[1] : "0";
  261.  
  262.       /* --------------------------------------------------------------------
  263.       -- LAST UPDATE CHECK
  264.       -- --------------------------------------------------------------------
  265.       -- It will ignore the game if it was updated less than 30 days ago
  266.       ----------------------------------------------------------------------- */
  267.  
  268.       // Gets today's date
  269.       var today = new Date();
  270.  
  271.       // Parses the update & release info
  272.       var updatedDate = new Date(updated);
  273.       var releaseDate = new Date(release);
  274.      
  275.       // Subtracts both dates to get days gap
  276.       var daysSinceLastUpdate = Math.floor((today - updatedDate) / (1000 * 60 * 60 * 24));
  277.       var daysSinceRelease = Math.floor((today - releaseDate) / (1000 * 60 * 60 * 24));
  278.  
  279.       if (force === true || daysSinceRelease < 30 || daysSinceLastUpdate > 30) {
  280.  
  281.           /* --------------------------------------------------------------------
  282.           -- OUTPUTS THE VALUE INTO DIFFERENT CELLS
  283.           ----------------------------------------------------------------------- */
  284.  
  285.           // Write the scraped data to the same row
  286.           var outputRow = selectedRange.getRow() + i;
  287.  
  288.           // Write the scraped data to adjacent columns
  289.           var outputColumn = selectedRange.getColumn() + 1;
  290.  
  291.           // All values that will be written
  292.           var newValues = [
  293.               rawTitle,
  294.               title,
  295.               version,
  296.               status,
  297.               engine,
  298.               gameplay,
  299.               tags,
  300.               moonRating(rating),
  301.               votes,
  302.               updated,
  303.               release,
  304.               start,
  305.               dev,
  306.               censored
  307.           ];
  308.  
  309.           // Read existing values in the row
  310.           var rangeToWrite = sheet.getRange(outputRow, outputColumn, 1, newValues.length);
  311.           var existingValues = rangeToWrite.getValues()[0];
  312.  
  313.           // Compare existing values with new values
  314.           for (var j = 0; j < existingValues.length; j++) {
  315.  
  316.             // If the new value is empty, keep the existing value
  317.             if (newValues[j] === "" || newValues[j] == null) {
  318.               newValues[j] = existingValues[j];
  319.             }
  320.           }
  321.  
  322.           // Set only the new non-empty values
  323.           rangeToWrite.setValues([newValues]);
  324.       }
  325.  
  326.     }
  327.  
  328.   }
  329.  
  330. }
  331.  
  332. /* --------------------------------------------------------------------
  333. -- IGNORE DATE RESTRICTION
  334. ----------------------------------------------------------------------- */
  335.  
  336. function forceScrapeData() {
  337.   scrapeData(true);
  338. }
  339.  
  340. /* --------------------------------------------------------------------
  341. -- CHECK IF THE STRING IS A VALID URL
  342. ----------------------------------------------------------------------- */
  343. function isValidURL(str) {
  344.   var pattern = new RegExp('^(https?:\\/\\/)?'+ // protocol
  345.   '((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // domain name
  346.   '((\\d{1,3}\\.){3}\\d{1,3}))' + // OR an IP Address
  347.   '(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*' + // port and path
  348.   '(\\?[;&a-z\\d%_.~+=-]*)?' + // query string
  349.   '(\\#[-a-z\\d_]*)?$', 'i'); // fragment locator
  350.   return pattern.test(str);
  351. }
  352.  
  353. /* --------------------------------------------------------------------
  354. -- CREATES AN EXTRA MENU OPTION
  355. ----------------------------------------------------------------------- */
  356.  
  357. function onOpen() {
  358.   var ui = SpreadsheetApp.getUi();
  359.   ui.createMenu('Extra Functions')
  360.       .addItem('Scrape Data', 'scrapeData')
  361.       .addItem('Force Scrape', 'forceScrapeData')
  362.       .addItem('Create Hyperlink','createHyperlink')
  363.       .addToUi();
  364. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement