Data hosted with ♥ by Pastebin.com - Download Raw - See Original
/**
 * Required dependencies: node-fetch and node-html-parser
 * Usage example:

const FORUM_URL = ""; // e.g. "https://www.punkinfinland.net/forum/"
const TOPIC_ID  = ""; // e.g. "769221" (see https://www.punkinfinland.net/forum/viewtopic.php?t=769221)

const crawler = require( "./phpbb_crawler" )({
    logger: false,
    fetch_timeout: 1000,
    forum_url: FORUM_URL
});

crawler.fetchTopicPosts( TOPIC_ID ).then( posts => {
    console.log( posts );
});

*/
  19.  
  20. const fetch = (...args) => import( "node-fetch" ).then(({ default: fetch }) => fetch( ...args ));
  21.  
  22. const HTMLParser = require( "node-html-parser" );
  23.  
  24. module.exports = (() => {
  25.  
  26.     const assets = {};
  27.  
  28.     return opts => {
  29.  
  30.         const FORUM_BASE_URL = opts.forum_url;
  31.  
  32.         const FETCH_TIMEOUT = opts.fetch_timeout == null ? 100 : opts.fetch_timeout;
  33.  
  34.         const logger = opts.logger == false ? { info: ()=>{}, error: ()=>{} } : opts.logger || console;
  35.  
  36.         if (!FORUM_BASE_URL) throw new Error( "Missing FORUM_BASE_URL" );
  37.  
  38.         return { fetchTopicPosts, fetchTopicviewStats, fetchTopicviewPagePosts };
  39.  
  40.         async function fetchDOM( url, force ) {
  41.  
  42.             if (force || !assets[ url ]) {
  43.  
  44.                 // Koska ei haluta rasittaa palvelinta turhan paljoa, odotellaan kutsujen välillä
  45.                 await new Promise( cb => setTimeout( cb, FETCH_TIMEOUT ));
  46.  
  47.                 var response = await fetch( url );
  48.  
  49.                 assets[ url ] = await response.text();
  50.             }
  51.  
  52.             return HTMLParser.parse( assets[ url ] );
  53.         }
  54.  
  55.         async function fetchTopicPosts( TOPIC_ID, force ) {
  56.  
  57.             var stats = await fetchTopicviewStats( TOPIC_ID, force );
  58.  
  59.             logger.info(  `Topic ${ TOPIC_ID } stats:`, stats );
  60.  
  61.             var all_posts = [];
  62.  
  63.             for (var page_n = 1; page_n <= stats.pages_count; page_n++) {
  64.  
  65.                 var posts = await fetchTopicviewPagePosts( TOPIC_ID, stats.posts_per_page * (page_n - 1), force );
  66.  
  67.                 logger.info(  `Got ${ posts.length } post from page ${ page_n }` );
  68.  
  69.                 for (var post of posts) all_posts.push( post );
  70.             }
  71.  
  72.             logger.info( `Ready! Fetched ${ all_posts.length } posts from topic ${ TOPIC_ID }!` );
  73.  
  74.             return all_posts;
  75.         }
  76.  
  77.         async function fetchTopicviewStats( topic_id, force ) {
  78.  
  79.             var DOM = await fetchDOM( `${ FORUM_BASE_URL }viewtopic.php?t=${ topic_id }&start=0` );
  80.  
  81.             var _pagination_elem = DOM.querySelector( "#page-body .action-bar.bar-top > .pagination" );
  82.  
  83.             var posts_count, pages_count = 1, posts_per_page;
  84.  
  85.             try {
  86.                 // Ei toimi aina kirjautuneelle käyttäjälle (mahdollinen "Lukemattomia viestejä"-ilmoitus)
  87.                 // "12 345 posts <ul><li class=....." -> "12345"
  88.                 posts_count = parseInt( _pagination_elem.innerHTML.trim().match( /^[\d\s]+/ )[0].replace( /\s/g, "" ));
  89.             } catch (error) { logger.error( error ); }
  90.  
  91.             try {
  92.                 var _last_page_btn_elem;
  93.                 for (var elem of _pagination_elem.querySelectorAll( "ul > li:not(.arrow)" )) _last_page_btn_elem = elem;
  94.                 pages_count = parseInt( _last_page_btn_elem.text );
  95.             } catch (error) { logger.error( error ); }
  96.  
  97.             try {
  98.                 posts_per_page = DOM.querySelectorAll( "#page-body > .post" ).length;
  99.             } catch (error) { logger.error( error ); }
  100.  
  101.             return { posts_count, pages_count, posts_per_page };
  102.         }
  103.  
  104.         async function fetchTopicviewPagePosts( topic_id, offset, force ) {
  105.  
  106.             var DOM = await fetchDOM( `${ FORUM_BASE_URL }viewtopic.php?t=${ topic_id }&start=${ offset }` );
  107.  
  108.             var posts = DOM.querySelectorAll( "#page-body > .post" ).map( elem => {
  109.  
  110.                 var post = { }
  111.  
  112.                 try { // id="pXXXX" -> XXXX
  113.                     post.id = elem.id.substr( 1 );
  114.                 } catch (error) { logger.error( error ); }
  115.  
  116.                 try {
  117.                     post.username = elem.querySelector( ".postprofile .username, .postprofile .username-coloured" ).textContent;
  118.                 } catch (error) { logger.error( error ); }
  119.  
  120.                 try { // p.author esim.: <span class="posti">... .&raquo; </span>12 Aug 2021, 15:17\n
  121.                     var post_header = elem.querySelector( ".postbody .author" ).innerHTML.trim();
  122.                     post.published_at = post_header.substr( post_header.lastIndexOf(">") + 1 );
  123.                     post.published_at = datetime_to_timestamp( post.published_at );
  124.                 } catch (error) { logger.error( error ); }
  125.  
  126.                 try {
  127.                     post.content = elem.querySelector( ".postbody .content" ).innerHTML;
  128.                 } catch (error) { logger.error( error ); }
  129.  
  130.                 return post;
  131.             });
  132.  
  133.             return posts;
  134.         }
  135.     }
  136. })();
  137.  
  138. // @see https://github.com/widop/phpbb3/blob/master/language/en/common.php
  139. const datetime_matchers = [{
  140.     pattern: /Yesterday, (?<hours>\d+):(?<minutes>\d+)/,
  141.     _parse: (match, now) => (now = new Date( now.getTime() - 24 * 60 * 60 * 1000 ), now.setHours(match.groups.hours), now.setMinutes(match.groups.minutes), now )
  142. }, {
  143.     pattern: /Today, (?<hours>\d+):(?<minutes>\d+)/,
  144.     _parse: (match, now) => (now.setHours(match.groups.hours), now.setMinutes(match.groups.minutes), now )
  145. }, {
  146.     pattern: /1 hour ago/,
  147.     _parse: (match, now) => new Date( now.getTime() - 60 * 60 * 1000 )
  148. }, {
  149.     pattern: /(?<minutes>\d+) minutes? ago/,
  150.     _parse: (match, now) => new Date( now.getTime() - match.groups.minutes * 60 * 1000 )
  151. }, {
  152.     pattern: /less than a minute ago/,
  153.     _parse: (match, now) => now
  154. }];
  155.  
  156. function datetime_to_timestamp( string ) {
  157.  
  158.     var now = new Date();
  159.     now.setSeconds(0);
  160.  
  161.     for (var matcher of datetime_matchers) {
  162.         var match = string.match( matcher.pattern );
  163.         if (match) return matcher._parse ? matcher._parse( match, now ) : match;
  164.     }
  165.  
  166.     return new Date( string ) || string;
  167. }