/**
* Required dependencies: node-fetch and node-html-parser
* Usage example:
const FORUM_URL = ""; // e.g. "https://www.punkinfinland.net/forum/" (keep the trailing slash: "viewtopic.php?..." is appended directly)
const TOPIC_ID = ""; // e.g. "769221" (see https://www.punkinfinland.net/forum/viewtopic.php?t=769221)
const crawler = require( "./phpbb_crawler" )({
logger: false,
fetch_timeout: 1000,
forum_url: FORUM_URL
});
crawler.fetchTopicPosts( TOPIC_ID ).then( posts => {
console.log( posts );
});
*/
const fetch = (...args) => import( "node-fetch" ).then(({ default: fetch }) => fetch( ...args ));
const HTMLParser = require( "node-html-parser" );
module.exports = (() => {
const assets = {};
return opts => {
const FORUM_BASE_URL = opts.forum_url;
const FETCH_TIMEOUT = opts.fetch_timeout == null ? 100 : opts.fetch_timeout;
const logger = opts.logger == false ? { info: ()=>{}, error: ()=>{} } : opts.logger || console;
if (!FORUM_BASE_URL) throw new Error( "Missing FORUM_BASE_URL" );
return { fetchTopicPosts, fetchTopicviewStats, fetchTopicviewPagePosts };
/**
 * Downloads a page (or reuses the cached copy) and returns it parsed by node-html-parser.
 *
 * @param {string} url - Address of the page to load; also used as the cache key in `assets`.
 * @param {boolean} [force] - When truthy, the cached HTML is ignored and the page is downloaded again.
 * @returns {Promise<object>} Result of `HTMLParser.parse` for the page's HTML text.
 */
async function fetchDOM( url, force ) {
    const hasCachedCopy = !force && Boolean( assets[ url ] );
    if (!hasCachedCopy) {
        // Pause before every real request so the forum server is not put under needless load
        await new Promise( resolve => { setTimeout( resolve, FETCH_TIMEOUT ); });
        const reply = await fetch( url );
        assets[ url ] = await reply.text();
    }
    return HTMLParser.parse( assets[ url ] );
}
/**
 * Collects the posts of every page of a topic, in page order.
 *
 * @param {string|number} TOPIC_ID - Topic identifier passed on to the stats and page fetchers.
 * @param {boolean} [force] - Passed on unchanged to `fetchTopicviewStats` and `fetchTopicviewPagePosts`.
 * @returns {Promise<Array<object>>} All post objects returned by the page fetcher, concatenated.
 */
async function fetchTopicPosts( TOPIC_ID, force ) {
    const topicStats = await fetchTopicviewStats( TOPIC_ID, force );
    logger.info( `Topic ${ TOPIC_ID } stats:`, topicStats );

    const collected = [];
    let pageNumber = 1;
    while (pageNumber <= topicStats.pages_count) {
        // Each page starts at a multiple of the page size measured on the first page
        const offset = topicStats.posts_per_page * (pageNumber - 1);
        const pagePosts = await fetchTopicviewPagePosts( TOPIC_ID, offset, force );
        logger.info( `Got ${ pagePosts.length } post from page ${ pageNumber }` );
        collected.push( ...pagePosts );
        pageNumber += 1;
    }

    logger.info( `Ready! Fetched ${ collected.length } posts from topic ${ TOPIC_ID }!` );
    return collected;
}
/**
 * Reads paging statistics of a topic from its first page (`start=0`).
 *
 * Every value is scraped on a best-effort basis: a failure is reported through
 * `logger.error` and the corresponding field keeps its default.
 *
 * @param {string|number} topic_id - Value for the `t` query parameter of viewtopic.php.
 * @param {boolean} [force] - Forwarded to `fetchDOM`; when truthy the page is downloaded again instead of read from the cache.
 * @returns {Promise<{posts_count: (number|undefined), pages_count: number, posts_per_page: (number|undefined)}>}
 *   `pages_count` defaults to 1 when no usable last-page label is found.
 */
async function fetchTopicviewStats( topic_id, force ) {
    const DOM = await fetchDOM( `${ FORUM_BASE_URL }viewtopic.php?t=${ topic_id }&start=0`, force );
    const pagination_elem = DOM.querySelector( "#page-body .action-bar.bar-top > .pagination" );
    let posts_count;
    let pages_count = 1;
    let posts_per_page;

    try {
        // Does not always work for a logged-in user (a possible "unread posts" notice comes first)
        // "12 345 posts <ul><li class=....." -> "12345"
        posts_count = parseInt( pagination_elem.innerHTML.trim().match( /^[\d\s]+/ )[0].replace( /\s/g, "" ), 10 );
    } catch (error) { logger.error( error ); }

    try {
        // The last non-arrow list item carries the number of the final page
        const page_btn_elems = pagination_elem.querySelectorAll( "ul > li:not(.arrow)" );
        const last_page_btn_elem = page_btn_elems[ page_btn_elems.length - 1 ];
        // No page buttons at all (presumably a single-page topic): keep the default of one page
        if (last_page_btn_elem) {
            const last_page_number = parseInt( last_page_btn_elem.text, 10 );
            // Never let NaN (or a nonsensical value) through: callers loop up to pages_count
            if (last_page_number >= 1) pages_count = last_page_number;
            else logger.error( new Error( `Unexpected last page label: ${ last_page_btn_elem.text }` ));
        }
    } catch (error) { logger.error( error ); }

    try {
        posts_per_page = DOM.querySelectorAll( "#page-body > .post" ).length;
    } catch (error) { logger.error( error ); }

    return { posts_count, pages_count, posts_per_page };
}
/**
 * Loads one page of a topic and extracts its posts.
 *
 * Each field is scraped independently; a field that cannot be read is reported
 * through `logger.error` and simply left off the post object.
 *
 * @param {string|number} topic_id - Value for the `t` query parameter of viewtopic.php.
 * @param {number} offset - Value for the `start` query parameter (index of the first post on the page).
 * @param {boolean} [force] - Forwarded to `fetchDOM`; when truthy the page is downloaded again instead of read from the cache.
 * @returns {Promise<Array<{id?: string, username?: string, published_at?: *, content?: string}>>}
 *   One object per `#page-body > .post` element, in document order.
 */
async function fetchTopicviewPagePosts( topic_id, offset, force ) {
    const DOM = await fetchDOM( `${ FORUM_BASE_URL }viewtopic.php?t=${ topic_id }&start=${ offset }`, force );
    return DOM.querySelectorAll( "#page-body > .post" ).map( elem => {
        const post = {};
        try { // id="pXXXX" -> XXXX
            post.id = elem.id.slice( 1 );
        } catch (error) { logger.error( error ); }
        try {
            post.username = elem.querySelector( ".postprofile .username, .postprofile .username-coloured" ).textContent;
        } catch (error) { logger.error( error ); }
        try { // p.author e.g.: <span class="posti">... .» </span>12 Aug 2021, 15:17\n
            const post_header = elem.querySelector( ".postbody .author" ).innerHTML.trim();
            const published_text = post_header.slice( post_header.lastIndexOf( ">" ) + 1 );
            // Store the raw text first so it survives if the conversion below throws
            post.published_at = published_text;
            post.published_at = datetime_to_timestamp( published_text );
        } catch (error) { logger.error( error ); }
        try {
            post.content = elem.querySelector( ".postbody .content" ).innerHTML;
        } catch (error) { logger.error( error ); }
        return post;
    });
}
}
})();
// @see https://github.com/widop/phpbb3/blob/master/language/en/common.php
/**
 * Matchers for phpBB's relative date labels (English language pack).
 *
 * `pattern` is tried against the date text; on a match `_parse( match, now )` is
 * called with the regex match and a `Date` holding the current time truncated to
 * the minute. `now` is private to one conversion, so a parser may mutate it.
 */
const datetime_matchers = [{
    pattern: /Yesterday, (?<hours>\d+):(?<minutes>\d+)/,
    _parse: (match, now) => {
        // Step one calendar day back (not 24 h) so a DST change cannot land on the wrong date
        now.setDate( now.getDate() - 1 );
        now.setHours( Number( match.groups.hours ), Number( match.groups.minutes ));
        return now;
    }
}, {
    pattern: /Today, (?<hours>\d+):(?<minutes>\d+)/,
    _parse: (match, now) => {
        now.setHours( Number( match.groups.hours ), Number( match.groups.minutes ));
        return now;
    }
}, {
    // The language pack also defines "Tomorrow" (forum clock ahead of the reader's)
    pattern: /Tomorrow, (?<hours>\d+):(?<minutes>\d+)/,
    _parse: (match, now) => {
        now.setDate( now.getDate() + 1 );
        now.setHours( Number( match.groups.hours ), Number( match.groups.minutes ));
        return now;
    }
}, {
    pattern: /1 hour ago/,
    _parse: (match, now) => new Date( now.getTime() - 60 * 60 * 1000 )
}, {
    pattern: /(?<minutes>\d+) minutes? ago/,
    _parse: (match, now) => new Date( now.getTime() - Number( match.groups.minutes ) * 60 * 1000 )
}, {
    pattern: /less than a minute ago/,
    _parse: (match, now) => now
}];

/**
 * Converts a date label scraped from a post header into a `Date`.
 *
 * Relative labels ("Today, 15:17", "5 minutes ago", ...) are resolved against the
 * current local time, truncated to the minute. Anything else is handed to the
 * `Date` constructor.
 *
 * @param {string} string - Date text as it appears on the page.
 * @returns {Date|string} The parsed date, or the unchanged input text when it cannot be parsed.
 */
function datetime_to_timestamp( string ) {
    const now = new Date();
    // Forum labels have minute precision: drop seconds and milliseconds
    now.setSeconds( 0, 0 );
    for (const matcher of datetime_matchers) {
        const match = string.match( matcher.pattern );
        if (match) return matcher._parse ? matcher._parse( match, now ) : match;
    }
    const parsed = new Date( string );
    // A Date object is always truthy, so an invalid date has to be detected explicitly
    return Number.isNaN( parsed.getTime() ) ? string : parsed;
}