Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* ====================================================================
- == [F95ZONE] THREAD SCRAPER
- =======================================================================
- -- MODIFIED : 2023-11-04
- -- CREATED : 2023-10-27
- -- LICENSE : MIT license (Just not sell it ...)
- -- --------------------------------------------------------------------
- -- Scrapes the data with CheerioGS (https://github.com/tani/cheeriogs)
- -- To use it, first install the library using the Script ID
- -- 1ReeQ6WO8kKNxoaA_O0XEQ589cIrRvEBA9qcWpNqdOP17i47u6N9M5Xh0
- ======================================================================= */
- /* --------------------------------------------------------------------
- -- CheerioGS starting function
- ----------------------------------------------------------------------- */
/**
 * Downloads the body of a page.
 *
 * HTTP error statuses are muted by the fetch options, and transport-level
 * failures thrown by UrlFetchApp are caught here, so that a single bad URL
 * does not abort the caller; both cases are logged and reported as an empty
 * string.
 *
 * @param {string} url - Address of the page to download.
 * @returns {string} The response body when the status code is exactly 200,
 *     otherwise an empty string.
 */
function getContent(url) {
  let response;
  try {
    response = UrlFetchApp.fetch(url, {muteHttpExceptions: true});
  } catch (err) {
    Logger.log(`Fetch failed for ${url}: ${err}`);
    return '';
  }
  const code = response.getResponseCode();
  if (code !== 200) {
    Logger.log(`Fetch for ${url} returned HTTP ${code}`);
    return '';
  }
  return response.getContentText();
}
- /* --------------------------------------------------------------------
- -- Actual scrape function
- ----------------------------------------------------------------------- */
/**
 * Scrapes forum thread pages for the URLs in the active selection and writes
 * the parsed fields into the cells to the right of each URL.
 *
 * Only the first column of the selection is read. For each row the URL is the
 * cell's hyperlink when it has one, otherwise the cell text. Rows are skipped,
 * leaving the sheet untouched, when the cell is not text, the value is not a
 * valid URL, or the page could not be fetched.
 *
 * Columns written, starting one column right of the selection:
 * title, version, dev, status, engine, gameplay, tags, rating (moons), votes,
 * release, censored, start. Empty scraped values never overwrite existing
 * cell contents. Notes are attached to the tags cell (full tag list) and to
 * the rating cell (numeric rating).
 *
 * Relies on getContent, isValidURL and moonRating from this file and on the
 * Cheerio library.
 *
 * @param {boolean} [force] - When truthy, rows are written regardless of the
 *     release / thread-update date check.
 */
function scrapeData(force) {
  // Lookup tables and patterns do not depend on the row, so build them once.
  const statusLabels = ['Completed', 'Abandoned', 'Onhold'];
  const engineLabels = [
    'ADRIFT', 'Flash', 'Java',
    'Others', 'QSP', 'RAGS',
    'RPGM', "Ren'Py", 'Tads',
    'Unity', 'Unreal Engine', 'WebGL',
    'Wolf RPG', 'HTML'
  ];
  const gameplayOptions = {
    'sandbox': 'Sandbox',
    'turn based combat': 'TBC',
    'management': 'Management',
    'simulator': 'Simulator',
    'rpg': 'RPG'
  };
  // Regex fragments that mark the end of the developer's name.
  // 'Subs?cribeStar' accepts the correct spelling as well as the common typo.
  const devStopWords = [
    'Patreon', 'F95Zone', 'Itch\\.io', 'Discord', 'Subs?cribeStar', 'Buy Me A Coffee',
    'Blogspot', 'Website', 'Twitter', 'Pixiv', 'Fanbox', '\\-'
  ];
  const devMatchRegex = new RegExp(`Developer\\s*(?:\\/\\s*Publisher)?\\s*:\\s*((?:(?!${devStopWords.join('|')}).)+)`, 'i');
  const msPerDay = 1000 * 60 * 60 * 24;
  const today = new Date();

  const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  const selectedRange = sheet.getActiveRange();
  const selectedValues = selectedRange.getRichTextValues();
  const firstRow = selectedRange.getRow();
  // Results go to the columns immediately right of the selection.
  const outputColumn = selectedRange.getColumn() + 1;

  for (let i = 0; i < selectedValues.length; i++) {
    // The URLs are expected in the first column of the selection.
    const cell = selectedValues[i][0];
    // Guard against missing rich-text entries (non-text cells).
    if (!cell) continue;

    // Prefer the embedded link; fall back to the cell text.
    const url = cell.getLinkUrl() || cell.getText();
    if (!url || !isValidURL(url)) continue;

    const content = getContent(url);
    if (!content) {
      // Parsing an empty page would only yield placeholders that could
      // overwrite good data, so leave the row alone.
      Logger.log(`Skipping row ${firstRow + i}: no content fetched from ${url}`);
      continue;
    }
    const $ = Cheerio.load(content);

    /* --------------------------------------------------------------------
    -- [Title] / [Version] / backup [Dev] from "Title [Version] [Dev]"
    -------------------------------------------------------------------- */
    const threadTitle = $('h1.p-title-value');
    // Only the bare text nodes: the label spans (status, engine...) are excluded.
    const rawTitle = threadTitle.contents().filter(function() {
      return this.type === 'text';
    }).text();
    const titleMatch = /(.+?)\s*\[(.+?)\](?:\s*\[(.+?)\])?/.exec(rawTitle);
    // Without a match, the whole raw text is assumed to be the title.
    const title = (titleMatch ? titleMatch[1] : rawTitle).trim();
    const version = titleMatch ? titleMatch[2] : "Version N.F. ⚠️";
    const devBackup = titleMatch && titleMatch[3] ? titleMatch[3] + " ❓" : "Dev N.F. ⚠️";

    /* --------------------------------------------------------------------
    -- [Status] -- defaults to 'Active' when no status label is present
    -------------------------------------------------------------------- */
    const statusSpan = threadTitle.find('span').filter(function() {
      return statusLabels.includes($(this).text());
    }).first();
    const status = statusSpan.length ? statusSpan.text() : 'Active';

    /* --------------------------------------------------------------------
    -- [Engine] -- defaults to unknown when no engine label is present
    -------------------------------------------------------------------- */
    const engineSpan = threadTitle.find('span').filter(function() {
      return engineLabels.includes($(this).text());
    }).first();
    const engine = engineSpan.length ? engineSpan.text() : '⚠️ Unknown';

    /* --------------------------------------------------------------------
    -- [Tags] -- comma-separated list
    -------------------------------------------------------------------- */
    const tags = $('span.js-tagList').find('a.tagItem').map(function() {
      return $(this).text();
    }).get().join(', ');

    /* --------------------------------------------------------------------
    -- [Gameplay] -- first matching tag wins, otherwise 'Visual Novel'
    -------------------------------------------------------------------- */
    const gameplayKey = Object.keys(gameplayOptions).find(option => tags.includes(option));
    const gameplay = gameplayOptions[gameplayKey] || 'Visual Novel';

    /* --------------------------------------------------------------------
    -- Thread [Start] date -- date part of the first post's timestamp
    -------------------------------------------------------------------- */
    const timeElement = $('article').first().find('header.message-attribution').find('time.u-dt');
    const startStamp = timeElement.attr('datetime');
    let start = null;
    if (startStamp) {
      start = startStamp.split('T')[0];
    } else {
      Logger.log("<time> element not found.");
    }

    /* --------------------------------------------------------------------
    -- [Updated] / [Release] / [Dev] / [Censored] from the first message body
    -------------------------------------------------------------------- */
    const articleText = $('article.message-body').first().text();
    const updated = /Thread Updated:\s*([^\s]+)/.exec(articleText)?.[1].toString() || "Not found ⚠️";
    const release = /(Release|Update) Date:\s*([^\s]+)/.exec(articleText)?.[2] || "Not found ⚠️";

    const devMatch = devMatchRegex.exec(articleText)?.[1]?.trim();
    // A name found in the post that is not "You" and disagrees with the one
    // from the title is flagged with a warning sign.
    const devBackupName = devBackup.replace(" ❓", "").replace(" ⚠️", "").trim();
    const isDifferentFromBackup = devMatch && devMatch !== "You" && devMatch !== devBackupName;
    const dev = isDifferentFromBackup ? devMatch + " ⚠️" : devMatch || devBackup;
    Logger.log("Dev: " + dev);
    Logger.log("Backup: " + devBackup);

    // `$` in the lookahead also accepts a value on the very last line.
    const censored = /Censored:\s*(.+?)(?=\r|\n|$)/.exec(articleText)?.[1] || "Prob. Not";

    /* --------------------------------------------------------------------
    -- [Rating] & [Votes] from the last JSON-LD script on the page
    -------------------------------------------------------------------- */
    const jsonText = $('script[type="application/ld+json"]').last().html() || '';
    const rating = /"ratingValue": "(\d+(?:\.\d+)?)"/.exec(jsonText)?.[1] || "No rating yet";
    const votes = /"ratingCount": "(\d+)"/.exec(jsonText)?.[1] || "0";

    /* --------------------------------------------------------------------
    -- Date check
    -- Unless forced, the row is written only when the release is less than
    -- 30 days old or the thread update is more than 30 days old. Dates that
    -- cannot be parsed give NaN, which fails both comparisons.
    -------------------------------------------------------------------- */
    const daysSinceLastUpdate = Math.floor((today - new Date(updated)) / msPerDay);
    const daysSinceRelease = Math.floor((today - new Date(release)) / msPerDay);
    if (!(force || daysSinceRelease < 30 || daysSinceLastUpdate > 30)) continue;

    /* --------------------------------------------------------------------
    -- Write the data into the cells right of the URL
    -------------------------------------------------------------------- */
    const outputRow = firstRow + i;
    // Insertion order of this object is the column order on the sheet.
    const fields = {
      title,
      version,
      dev,
      status,
      engine,
      gameplay,
      tags,
      rating: moonRating(rating),
      votes,
      release,
      censored,
      start
    };
    const fieldNames = Object.keys(fields);
    const newValues = Object.values(fields);

    // Notes are placed by column name, not by searching for the value, so a
    // value that happens to repeat in another column cannot misplace them.
    // The [Tags] note holds the full tag list; the rating note the exact number.
    sheet.getRange(outputRow, outputColumn + fieldNames.indexOf('tags')).setNote(tags);
    sheet.getRange(outputRow, outputColumn + fieldNames.indexOf('rating')).setNote('Rating: ' + rating);

    // Empty scraped values must not wipe what is already in the sheet.
    const rangeToWrite = sheet.getRange(outputRow, outputColumn, 1, newValues.length);
    const existingValues = rangeToWrite.getValues()[0];
    const mergedValues = newValues.map((value, j) =>
      (value === "" || value == null) ? existingValues[j] : value
    );
    rangeToWrite.setValues([mergedValues]);
  }
}
- /* --------------------------------------------------------------------
- -- Ignore all restrictions when scraping
- ----------------------------------------------------------------------- */
/**
 * Menu handler: runs scrapeData with the date restriction disabled.
 *
 * Declared as a regular function (not a const arrow) so it is hoisted and is
 * a plain top-level function like the other handlers that the menu refers to
 * by name.
 *
 * @returns {*} Whatever scrapeData returns.
 */
function forceScrapeData() {
  return scrapeData(true);
}
- /* --------------------------------------------------------------------
- -- Transforms embedded links into actual Hyperlinks
- ----------------------------------------------------------------------- */
/**
 * Replaces every cell of the active selection that carries an embedded link
 * with a =HYPERLINK(...) formula labelled "Here".
 *
 * Cells without a link (or without rich text at all) are left untouched:
 * writing an empty formula to them in bulk would erase their content.
 */
function createHyperlink() {
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  const range = sheet.getActiveRange();
  range.getRichTextValues().forEach((row, rowOffset) => {
    row.forEach((richText, columnOffset) => {
      // Guard against missing rich-text entries (non-text cells).
      const link = richText ? richText.getLinkUrl() : null;
      if (!link) return;
      // A double quote inside a formula string literal is escaped by doubling it.
      const escapedLink = link.replace(/"/g, '""');
      // getCell is 1-indexed and relative to the range.
      range.getCell(rowOffset + 1, columnOffset + 1)
        .setFormula(`=HYPERLINK("${escapedLink}", "Here")`);
    });
  });
}
- /* ====================================================================
- == HELPER FUNCTIONS
- ======================================================================= */
- /* --------------------------------------------------------------------
- -- MoonRating -- Simulates rating stars with emojis.
- ----------------------------------------------------------------------- */
/**
 * Renders a 0-5 rating as five moon emojis.
 *
 * The integer part becomes full moons, any fractional part becomes a single
 * half moon, and the remainder is padded with new moons.
 *
 * @param {number|string} number - The rating; numeric strings are accepted.
 * @returns {string} Five moon emojis, or "------" when the input is missing,
 *     blank or not numeric. Values outside 0-5 are clamped into that range
 *     instead of producing an invalid repeat count.
 */
function moonRating(number) {
  const fullMoon = "🌕";
  const halfMoon = "🌗";
  const newMoon = "🌑";
  const totalMoons = 5;

  // null / undefined / blank strings would coerce to 0; treat them as "no rating".
  const isBlank = number == null || (typeof number === 'string' && number.trim() === '');
  const value = isBlank ? NaN : Number(number);
  if (Number.isNaN(value)) return "------";

  const clamped = Math.min(totalMoons, Math.max(0, value));
  const moons = Math.floor(clamped);
  const isHalfMoon = clamped % 1 !== 0;
  return fullMoon.repeat(moons) +
    (isHalfMoon ? halfMoon : "") +
    newMoon.repeat(totalMoons - moons - (isHalfMoon ? 1 : 0));
}
- /* --------------------------------------------------------------------
- -- Checks whether the string is a valid URL
- ----------------------------------------------------------------------- */
/**
 * Tests whether a string looks like a URL.
 *
 * Accepted shape: optional http(s) protocol, then a domain name or a dotted
 * IPv4-style address, then optional port, path, query string and fragment.
 * Matching is case-insensitive and the whole string must match.
 *
 * A domain label is written as `[a-z\d](?:[a-z\d-]*[a-z\d])?` rather than
 * with a repeated inner group: both forms accept the same labels, but the
 * repeated form backtracks exponentially on long words that are not followed
 * by a dot (for example a plain title typed into the cell).
 *
 * @param {string} str - Candidate text.
 * @returns {boolean} True when the whole string matches the URL pattern.
 */
function isValidURL(str) {
  const pattern = new RegExp(
    '^(?:https?:\\/\\/)?' +                                   // protocol
    '(?:(?:[a-z\\d](?:[a-z\\d-]*[a-z\\d])?\\.)+[a-z]{2,}|' +  // domain name
    '(?:\\d{1,3}\\.){3}\\d{1,3})' +                           // OR an IP address
    '(?:\\:\\d+)?(?:\\/[-a-z\\d%_.~+]*)*' +                   // port and path
    '(?:\\?[;&a-z\\d%_.~+=-]*)?' +                            // query string
    '(?:\\#[-a-z\\d_]*)?$',                                   // fragment locator
    'i'
  );
  return pattern.test(str);
}
- /* ====================================================================
- == UI FUNCTIONS
- ======================================================================= */
- /* --------------------------------------------------------------------
- -- Creates extra options on the menu
- ----------------------------------------------------------------------- */
/**
 * Adds the 'Extra Functions' menu to the spreadsheet UI.
 *
 * Each item refers, by name, to the top-level function it runs.
 */
function onOpen() {
  const ui = SpreadsheetApp.getUi();
  ui.createMenu('Extra Functions')
    .addItem('Scrape Data', 'scrapeData')
    .addItem('Force Scrape', 'forceScrapeData')
    .addItem('Create Hyperlink', 'createHyperlink')
    .addToUi();
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement