Advertisement
GhostPixels

JLPT Word list from jisho.org

Nov 7th, 2017
390
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 6.66 KB | None | 0 0
  1. using System;
  2. using System.Linq;
  3. using System.Text;
  4. using System.Net;
  5. using System.IO;
  6. using System.Text.RegularExpressions;
  7.  
  8. namespace JLPTWords {
  9.     class Program {
  10.  
  11.         static void Main(string[] args) {
  12.             string searchTerm = "#jlpt-n3 #word"; // The term to search for on jisho
  13.             searchTerm = searchTerm.Replace(" ", "%20").Replace("#", "%23").Replace("?", "%3F");
  14.  
  15.             string filePath = GetDirectoryPath(searchTerm);
  16.             Directory.CreateDirectory(filePath);
  17.  
  18.             DownloadInfo(searchTerm); // Will skip this step if files are already downloaded
  19.                                       // Delete the \files\ folder if you want a clean download
  20.  
  21.             GatherInfo(GetDirectoryPath(searchTerm), TranslateToFolderName(searchTerm) + ".txt");
  22.  
  23.             Console.WriteLine("Press ENTER to terminate...");
  24.             Console.ReadLine();
  25.         }
  26.  
  27.         public static void DownloadInfo(string searchTerm) {
  28.             byte currentPage = 0;
  29.             WebClient wc = new WebClient();
  30.             wc.Encoding = Encoding.UTF8;
  31.             string result;
  32.  
  33.             string filePath = GetDirectoryPath(searchTerm);
  34.             Directory.CreateDirectory(filePath);
  35.  
  36.             do {
  37.                 currentPage++;
  38.                 if (File.Exists(filePath + "page" + currentPage + ".txt")) {
  39.                     Console.WriteLine("File " + "page" + currentPage + ".txt" + " already exists at " + filePath
  40.                         + "\nSkipping downloads of pages " + currentPage + " and onward.");
  41.                     break;
  42.                 }
  43.  
  44.                 Console.WriteLine("Starting download of " + @"http://jisho.org/search/" + searchTerm + "?page=" + currentPage);
  45.                 result = wc.DownloadString(@"http://jisho.org/search/" + searchTerm + "?page=" + currentPage);
  46.                 Console.WriteLine("Download finished.");
  47.  
  48.                 File.Create(filePath + "page" + currentPage + ".txt").Dispose();
  49.                 Console.WriteLine("Created file " + filePath + "page" + currentPage + ".txt");
  50.  
  51.                 File.WriteAllText(filePath + "page" + currentPage + ".txt", result);
  52.                 Console.WriteLine("Wrote to file " + filePath + "page" + currentPage + ".txt\n");
  53.  
  54.             } while (result.Contains("\">More <span class='accesskey'>W</span>ords"));
  55.  
  56.             Console.WriteLine("Done! Created " + currentPage + " files at " + filePath + '\n');
  57.         }
  58.  
  59.         public static void GatherInfo(string directorypath, string outputfilename) {
  60.             string[] files = Directory.GetFiles(directorypath);
  61.             string[] info = new string[files.Length];
  62.             Regex htmltags = new Regex(@"\s*<.+?>\s*");
  63.  
  64.             if (File.Exists(directorypath + outputfilename)) {
  65.                 Console.WriteLine("File " + directorypath + outputfilename + " already exists. Cancelling.");
  66.                 return;
  67.             } else
  68.                 File.Create(directorypath + outputfilename).Dispose();
  69.             Console.WriteLine("File " + directorypath + outputfilename + " created for gathering data.\n");
  70.  
  71.             for (int i = 0; i < files.Length; i++) {
  72.                 string[] lines = File.ReadAllLines(files[i]);
  73.                 for (int j = 0; j < lines.Length; j++)
  74.                     if (lines[j].Contains("<span class=\"text\">")) {
  75.                         if (info[i] != null)
  76.                             info[i] += '\n';
  77.                         info[i] += htmltags.Replace(lines[j + 1], "").Trim() + '\t'; // Kanji
  78.                         Console.WriteLine("Word found in page file " + (i+1) + " in " + files[i]);
  79.                         //info[i] += htmltags.Replace(lines[j - 2], "").Trim() + '\t'; // Reading (Furigana), old way
  80.                         info[i] += new Regex("(?:.*?(?:Sentence search for ).*?(?:Sentence search for )(.+?)<\\/a>.*)|(?:.*?(?:Sentence search for )(.+?)<\\/a>.*)")
  81.                             .Replace(lines[j + 7].Length > 0 ? lines[j + 7] : lines[j + 9], "$1$2") + '\t'; // Reading (Furigana), new way
  82.  
  83.                         string jlpt = new Regex(".+?(JLPT N\\d).+").Replace(lines[j + 7].Length > 0 ? lines[j + 7] : lines[j + 9], "$1 ");
  84.                         info[i] += (jlpt.Length > 8 ? "" : jlpt) + '\t'; // JLPT level
  85.  
  86.                         string meanings = lines[j + 13].Trim(); // Meaning(s) of the word
  87.                         Regex filter = new Regex("(<div class=\"meaning-tags\">)([^(O)]+?)(<\\/div>)");
  88.                         meanings = filter.Replace(meanings, "$1[$2]$3"); // Adds [] around meaning-tags
  89.  
  90.                         filter = new Regex("(<span class=\"sense-tag.*?\">)(.+?)(<\\/div>)");
  91.                         meanings = filter.Replace(meanings, " $1{$2}$3"); // Adds {} around sense-tags
  92.  
  93.                         filter = new Regex("((<div class=\"sentence\">).*?(<\\/div>))|(<.+?>)|(&#8203;)|(\\[Wikipedia definition.+?Read more)");
  94.                         meanings = filter.Replace(meanings, ""); // Removes example sentences and all html tags, incl. Wiki def.
  95.  
  96.                         meanings = meanings.Replace("&#39;", "'").Replace("&quot;", "\"");
  97.  
  98.                         filter = new Regex(@"([^\s\(\[\{A-Z0-9])([A-Z]|[0-9]+\.|\[)(?= )(?<!.+\[Notes\].+)|(\S)(\[)|(\])(\S)|([A-Z])([0-9])");
  99.                         meanings = filter.Replace(meanings, "$1$3$5$7 $2$4$6$8"); // Adds a space before a number or capital letter
  100.  
  101.                         filter = new Regex("\\s*(Other forms)");
  102.                         meanings = filter.Replace(meanings, " | $1: "); // Makes the "Other forms" more clearly separated
  103.  
  104.                         info[i] += meanings;
  105.                     }
  106.             }
  107.  
  108.             Console.WriteLine("\nAll information gathered.");
  109.             using (StreamWriter file = new StreamWriter(directorypath + outputfilename, true)) {
  110.                 Console.WriteLine("Started writing ~" + info.Count(x => x != null) * 20 + " lines to file " + directorypath + outputfilename);
  111.                 file.WriteLine("Word\tReading\tJLPT\tMeaning");
  112.                 foreach (string s in info)
  113.                     if (s != null)
  114.                         file.WriteLine(s);
  115.                 file.Dispose();
  116.                 Console.WriteLine("\nWriting finished.");
  117.             }
  118.         }
  119.  
  120.         public static string GetDirectoryPath(string searchTerm) {
  121.             return Directory.GetCurrentDirectory() + @"\files\" + TranslateToFolderName(searchTerm) + @"\";
  122.         }
  123.  
  124.         public static string TranslateToFolderName(string searchTerm) {
  125.             return new Regex("[\\<\\>\\:\\\"\\/\\\\\\|\\*]|\\%3F").Replace(searchTerm.Replace("%20", "_").Replace("%23", ""), "");
  126.         }
  127.     }
  128. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement