rplantiko

Extract geographical coordinates from wikipedia pages of locations

Jan 4th, 2022 (edited)
815
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. "use strict";
  2.  
  3. const fs      = require('fs').promises;
  4. const html    = require('node-html-parser');
  5. const request = require('request');
  6.  
  7. // - Read HTML document INPUT_FILE from file system
  8. // - Extract all <a> tags
  9. //
  10. //   The <a href> links are supposed to point to wikipedia articles of locations
  11. //   Therefore, following the wikipedia convention, there should be
  12. //   elements of class name "longitude" and "latitude",
  13. //   specifying the geographical coordinates of the location
  14. //
  15. // - Read the HTML content of the specified links from the internet
  16. // - Extract coordinates from selectors ".longitude" and ".latitude"
  17. // - Output results in csv format:
  18. //     nr,name,url,lon,lat
  19. //   Here, nr is the running index of the array of <a> elements of INPUT_FILE
  20.  
  // Parameters, by default or from the command line:
  //   node <script> [INPUT_FILE [CHUNK_SIZE [DELAY]]]
  //
  // The command-line arguments are spread into getParameters() as positional
  // arguments, so that missing ones fall back to the defaults declared there.
  const [
    INPUT_FILE,    // Name of the HTML input file containing links
    rawChunkSize,
    rawDelay,
  ] = getParameters(...process.argv.slice(2));

  // Command-line values arrive as strings, but the values below are used in
  // arithmetic and comparisons - make sure they are numbers.
  const CHUNK_SIZE = Number(rawChunkSize);   // Chunk size for parallel HTTP(S) request execution
  const DELAY      = Number(rawDelay);       // Delay in ms between chunk execution

  run();
  29.      
  30.  
  /**
   * Entry point: reads INPUT_FILE, resolves the coordinates of all linked
   * pages and prints the result as CSV.
   *
   * Errors are caught here: they are logged to standard error and the process
   * exit code is set to 1, so that calling scripts can detect a failed run.
   *
   * @returns {Promise<void>} Settles when the run has finished.
   */
  async function run() {
    try {
      const fileContent = await fs.readFile(INPUT_FILE, 'utf8');
      const allGeoPos = await resolveAllGeoPosFromLinks( fileContent );
      generateCSV(allGeoPos);
    } catch(err) {
      console.error(err);
      // Only mark the run as failed; Node still exits on its own once
      // pending output has been flushed.
      process.exitCode = 1;
    }
  }
  40.  
  /**
   * Extract the coordinates of all locations linked in the given HTML source.
   *
   * The links are processed in chunks of CHUNK_SIZE: the requests of one chunk
   * run in parallel, and consecutive chunks are separated by a pause of
   * DELAY ms (if DELAY > 0).
   *
   * @param {string} inputFileHTML - HTML source code containing the <a> links.
   * @returns {Promise<Object[]>} The results of getGeoPos(), one per link, in
   *   the order of the links. Rejects as soon as one of the lookups fails.
   */
  async function resolveAllGeoPosFromLinks( inputFileHTML ) {

    const links = extractLinks( inputFileHTML );
    const chunks = chunk( links, CHUNK_SIZE );
    const allGeoPos = [];
    let nr = 0;   // running number of the link, starting with 1

    for (const [chunkIndex, c] of chunks.entries()) {
      // Pause between chunks only - there is nothing to wait for after the last one
      if (chunkIndex > 0 && DELAY > 0) await delay(DELAY);
      // Promise.all delivers the results in link order, whatever the order
      // in which the requests complete
      const positions = await Promise.all( c.map( a=>getGeoPos(a,++nr) ) );
      allGeoPos.push(...positions);
    }
    return allGeoPos;

  }
  59.  
  60.  
  /**
   * Promise to extract the coordinates of one linked location: fetches the
   * page the link points to and passes its HTML content to extractPos().
   *
   * @param {Object} a - Link object; its `href` is requested, the object
   *   itself is handed on to extractPos().
   * @param {number} i - Running number of the link, handed on to extractPos().
   * @returns {Promise<Object>} Resolves with the result of extractPos().
   *   Rejects if the URL cannot be encoded, the request fails, or extractPos()
   *   throws - a failure never leaves the promise pending.
   */
  async function getGeoPos(a,i) {
    const url = encodeURI( a.href );
    const htmlContent = await fetchURL(url);
    return extractPos(htmlContent,a,i);
  }
  71.  
  /**
   * Extract coordinates from the HTML content of a location's wikipedia page.
   *
   * @param {string} htmlContent - HTML source code of the page.
   * @param {Object} a - Link object; its `name` and `href` are copied to the result.
   * @param {number} i - Running number of the link, returned as `nr`.
   * @returns {{nr, name, href, lon, lat}} `lon` / `lat` hold the text of the
   *   element selected by ".longitude" / ".latitude", or "" if there is no
   *   such element or its text is empty.
   */
  function extractPos(htmlContent,a,i) {
    const page = html.parse( htmlContent );

    // Text of the element selected by querySelector(), "" as fallback
    const textOf = function (selector) {
      const match = page.querySelector(selector);
      if (!match) return "";
      return match.text || "";
    };

    const lon = textOf(".longitude");
    const lat = textOf(".latitude");

    return { nr: i, name: a.name, href: a.href, lon: lon, lat: lat };
  }
  88.  
  /**
   * Print the results as CSV rows `nr,name,url,lon,lat` on standard output,
   * ordered by their index of occurrence in the input file.
   *
   * Fields containing a comma, a double quote or a line break are enclosed in
   * double quotes (with embedded quotes doubled), so that values like the name
   * "Frankfurt, Main" do not shift the columns.
   *
   * @param {Array<{nr, name, href, lon, lat}>} results - Rows to print; the
   *   array itself is left unchanged.
   * @returns {void}
   */
  function generateCSV(results) {
  // Sort by index of occurrence in input file (on a copy, not in place)
    const sorted = [...results].sort((a,b)=>a.nr-b.nr);
  // Prepare data rows in .csv format
    const csv = sorted.map(generateCSVRow).join('\n');
  // Put result to standard output
    console.log(csv);

    function generateCSVRow( v ) {
      return [v.nr, v.name, v.href, v.lon, v.lat].map(escapeCSVField).join(',');
    }

    // Quote a field only if it contains a character with a meaning in CSV
    function escapeCSVField( value ) {
      const text = String(value);
      return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
    }
  }
  100.  
  /**
   * Parameters, by default or as passed in (e.g. read from the command line,
   * where every value is a string).
   *
   * @param {string} [file_name='orte-w.html'] - Name of the HTML input file.
   * @param {number|string} [chunk_size=5] - Chunk size; must be a positive integer.
   * @param {number|string} [delay=200] - Delay in ms; must be a finite number.
   * @returns {Array} [ file_name, chunk_size, delay ], with chunk_size and
   *   delay converted to numbers.
   * @throws {RangeError} If chunk_size or delay is not a valid number as
   *   described above.
   */
  function getParameters(
    file_name = 'orte-w.html',
    chunk_size = 5,
    delay = 200
  ) {
    const chunkSize = Number(chunk_size);
    if (!Number.isInteger(chunkSize) || chunkSize < 1) {
      throw new RangeError(`chunk_size must be a positive integer, got "${chunk_size}"`);
    }
    const delayMs = Number(delay);
    if (!Number.isFinite(delayMs)) {
      throw new RangeError(`delay must be a number of milliseconds, got "${delay}"`);
    }
    return [ file_name, chunkSize, delayMs ];
  }
  109.  
  /**
   * General: parse HTML source code and extract all <a> hyperlinks.
   *
   * @param {string} htmlSourceCode - HTML source code to parse.
   * @returns {Array<{name, href}>} One entry per <a> element: `name` is the
   *   element's text, `href` the value of its "href" attribute.
   */
  function extractLinks( htmlSourceCode ) {
    const anchors = html.parse( htmlSourceCode ).querySelectorAll("a");
    return anchors.map( (anchor) => ({
      name: anchor.text,
      href: anchor.getAttribute("href"),
    }) );
  }
  121.  
  /**
   * General: setTimeout(), but as a promise.
   *
   * @param {number} timeToDelay - Timeout passed on to setTimeout().
   * @returns {Promise<undefined>} Resolves, without a value, when the timer fires.
   */
  function delay(timeToDelay) {
    return new Promise(function (wakeUp) {
      setTimeout(wakeUp, timeToDelay);
    });
  }
  128.  
  129.  
  /**
   * General: Promise to perform a delayed HTTP(S) request.
   *
   * @param {string} url - Address to request, passed on to fetchURL().
   * @param {number} [delay=0] - Timeout in ms before the request is started.
   *   With 0 the request is still deferred until the timer callback runs.
   * @returns {Promise<*>} Settles like the promise returned by fetchURL(url).
   */
  function fetchURLDelayed(url,delay=0) {
    // fetchURL() returns a promise: chain it after the timer, so that its
    // result - or its failure - becomes the outcome of the returned promise
    return new Promise( (resolve) => setTimeout(resolve, delay) )
      .then( () => fetchURL(url) );
  }
  139.  
  /**
   * General: Promise to perform an HTTP(S) request.
   *
   * @param {string} url - Address to request.
   * @returns {Promise<*>} Resolves with the response body if the status code
   *   is 200. Rejects with the error reported by `request`, or with an Error
   *   naming the status code and the url for any other status code.
   */
  function fetchURL(url) {
    return new Promise(function (resolve, reject) {
      request(url, (error, response, body) => {
        if (error) return reject(error);
        if (response.statusCode !== 200) {
          // Reject with an Error object rather than a bare string, so that
          // callers get a `message` and a stack trace
          return reject(new Error(`Invalid status code <${response.statusCode}> for url ${url}`));
        }
        return resolve(body);
      })
    });
  }
  152.  
  /**
   * General: Split an array into chunks of size "chunkSize".
   *
   * @param {Array} a - Array to split; it is not modified.
   * @param {number|string} chunkSize - Number of elements per chunk, a
   *   positive integer (numeric strings are converted).
   * @returns {Array[]} Consecutive slices of `a`; the last one may be shorter.
   *   An empty array yields no chunks.
   * @throws {RangeError} If chunkSize is not a positive integer.
   */
  function chunk(a,chunkSize) {
    const size = Number(chunkSize);
    // A size of 0 or less would never advance through the array
    if (!Number.isInteger(size) || size < 1) {
      throw new RangeError(`chunkSize must be a positive integer, got "${chunkSize}"`);
    }
    const chunks = [];
    for (let i = 0; i < a.length; i += size) {
      chunks.push(a.slice(i, i + size));
    }
    return chunks;
  }
Add Comment
Please, Sign In to add comment