Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Linq;
- using System.Text;
- using System.Net;
- using System.IO;
- using System.Text.RegularExpressions;
namespace JLPTWords {
    /// <summary>
    /// Downloads the result pages of a jisho.org search and condenses the words found in them
    /// into a single tab-separated text file (word, reading, JLPT level, meanings).
    /// </summary>
    class Program {
        private const string SearchUrl = @"http://jisho.org/search/";
        private const string PageFilePrefix = "page";
        private const string PageFileExtension = ".txt";

        // Fragment of the "More Words" link; its presence in a page means another result page follows.
        private const string MoreWordsMarker = "\">More <span class='accesskey'>W</span>ords";

        // Line that opens a word entry in a downloaded page. The remaining fields are read at fixed
        // line offsets from it, so the parser depends on the exact layout of the downloaded HTML.
        private const string EntryMarker = "<span class=\"text\">";
        private const int WordOffset = 1;            // line holding the word itself (kanji)
        private const int StatusOffset = 7;          // line holding the reading link and the JLPT tag
        private const int StatusFallbackOffset = 9;  // used instead when the line at StatusOffset is empty
        private const int MeaningsOffset = 13;       // line holding all meanings of the word

        // The patterns are built once instead of once per word entry.
        private static readonly Regex HtmlTags = new Regex(@"\s*<.+?>\s*");
        private static readonly Regex ReadingPattern = new Regex(
            "(?:.*?(?:Sentence search for ).*?(?:Sentence search for )(.+?)<\\/a>.*)|(?:.*?(?:Sentence search for )(.+?)<\\/a>.*)");
        private static readonly Regex JlptPattern = new Regex(".+?(JLPT N\\d).+");
        private static readonly Regex MeaningTagsPattern = new Regex("(<div class=\"meaning-tags\">)([^(O)]+?)(<\\/div>)");
        private static readonly Regex SenseTagPattern = new Regex("(<span class=\"sense-tag.*?\">)(.+?)(<\\/div>)");
        // Matches example sentences, any HTML tag, zero-width spaces (as the entity or as the raw
        // U+200B character) and the Wikipedia definition blurb.
        private static readonly Regex MarkupPattern = new Regex(
            "((<div class=\"sentence\">).*?(<\\/div>))|(<.+?>)|(&#8203;|\u200B)|(\\[Wikipedia definition.+?Read more)");
        private static readonly Regex SpacingPattern = new Regex(
            @"([^\s\(\[\{A-Z0-9])([A-Z]|[0-9]+\.|\[)(?= )(?<!.+\[Notes\].+)|(\S)(\[)|(\])(\S)|([A-Z])([0-9])");
        private static readonly Regex OtherFormsPattern = new Regex("\\s*(Other forms)");
        private static readonly Regex InvalidFolderChars = new Regex("[\\<\\>\\:\\\"\\/\\\\\\|\\*]|\\%3F");

        /// <summary>
        /// Entry point: downloads every result page for the hard-coded search term and then
        /// gathers the words from the downloaded pages into one output file.
        /// </summary>
        static void Main(string[] args) {
            string searchTerm = "#jlpt-n3 #word"; // The term to search for on jisho
            searchTerm = searchTerm.Replace(" ", "%20").Replace("#", "%23").Replace("?", "%3F");
            DownloadInfo(searchTerm); // Will skip this step if files are already downloaded
            // Delete the \files\ folder if you want a clean download
            GatherInfo(GetDirectoryPath(searchTerm), TranslateToFolderName(searchTerm) + ".txt");
            Console.WriteLine("Press ENTER to terminate...");
            Console.ReadLine();
        }

        /// <summary>
        /// Downloads the search result pages for <paramref name="searchTerm"/> one by one and stores
        /// each as "pageN.txt" in the search term's directory. Stops as soon as a page file already
        /// exists or a downloaded page no longer contains the "More Words" link.
        /// </summary>
        /// <param name="searchTerm">The search term, already escaped for use in a URL.</param>
        public static void DownloadInfo(string searchTerm) {
            string filePath = GetDirectoryPath(searchTerm);
            Directory.CreateDirectory(filePath);
            int currentPage = 0; // int rather than byte: a byte silently wraps to 0 after 255 pages
            int downloaded = 0;
            string result;
            using (WebClient wc = new WebClient()) {
                wc.Encoding = Encoding.UTF8;
                do {
                    currentPage++;
                    string pageFileName = PageFilePrefix + currentPage + PageFileExtension;
                    if (File.Exists(filePath + pageFileName)) {
                        Console.WriteLine("File " + pageFileName + " already exists at " + filePath
                            + "\nSkipping downloads of pages " + currentPage + " and onward.");
                        break;
                    }
                    string url = SearchUrl + searchTerm + "?page=" + currentPage;
                    Console.WriteLine("Starting download of " + url);
                    result = wc.DownloadString(url);
                    Console.WriteLine("Download finished.");
                    // WriteAllText creates the file itself; no separate File.Create is needed.
                    File.WriteAllText(filePath + pageFileName, result);
                    downloaded++;
                    Console.WriteLine("Wrote to file " + filePath + pageFileName + "\n");
                } while (result.Contains(MoreWordsMarker));
            }
            // Report the pages actually written, not the page counter (which is off when skipping).
            Console.WriteLine("Done! Created " + downloaded + " files at " + filePath + '\n');
        }

        /// <summary>
        /// Reads every file in <paramref name="directorypath"/>, extracts the word entries found in
        /// them and writes one tab-separated line per word (preceded by a header line) to the output
        /// file. Does nothing if the output file already exists.
        /// </summary>
        /// <param name="directorypath">Directory holding the downloaded pages; must end with a
        /// directory separator because file names are appended to it directly.</param>
        /// <param name="outputfilename">Name of the output file created inside that directory.</param>
        public static void GatherInfo(string directorypath, string outputfilename) {
            string outputPath = directorypath + outputfilename;
            if (File.Exists(outputPath)) {
                Console.WriteLine("File " + outputPath + " already exists. Cancelling.");
                return;
            }
            // Shorter names first, then ordinal order, so that page2 sorts before page10.
            string[] files = Directory.GetFiles(directorypath)
                .OrderBy(f => f.Length)
                .ThenBy(f => f, StringComparer.Ordinal)
                .ToArray();
            string[] info = new string[files.Length];
            int wordCount = 0;
            // The output file is only created once parsing succeeded, so a failure here does not
            // leave an empty file behind that would make every later run cancel.
            Console.WriteLine("Gathering data for file " + outputPath + ".\n");
            for (int i = 0; i < files.Length; i++) {
                string[] lines = File.ReadAllLines(files[i]);
                StringBuilder page = new StringBuilder();
                for (int j = 0; j < lines.Length; j++) {
                    if (!lines[j].Contains(EntryMarker)) {
                        continue;
                    }
                    string entry = ParseEntry(lines, j);
                    if (entry == null) {
                        Console.WriteLine("Skipping incomplete entry at line " + (j + 1) + " in " + files[i]);
                        continue;
                    }
                    if (page.Length > 0) {
                        page.Append(Environment.NewLine);
                    }
                    page.Append(entry);
                    wordCount++;
                    Console.WriteLine("Word found in page file " + (i + 1) + " in " + files[i]);
                }
                if (page.Length > 0) {
                    info[i] = page.ToString();
                }
            }
            Console.WriteLine("\nAll information gathered.");
            using (StreamWriter file = new StreamWriter(outputPath, false)) {
                Console.WriteLine("Started writing " + (wordCount + 1) + " lines to file " + outputPath);
                file.WriteLine("Word\tReading\tJLPT\tMeaning");
                foreach (string s in info) {
                    if (s != null) {
                        file.WriteLine(s);
                    }
                }
            }
            Console.WriteLine("\nWriting finished.");
        }

        /// <summary>
        /// Builds the directory path in which the pages of a search term are stored:
        /// "&lt;current directory&gt;/files/&lt;folder name&gt;/".
        /// </summary>
        /// <param name="searchTerm">The URL-escaped search term.</param>
        /// <returns>The directory path, ending with a directory separator.</returns>
        public static string GetDirectoryPath(string searchTerm) {
            return Path.Combine(Directory.GetCurrentDirectory(), "files", TranslateToFolderName(searchTerm))
                + Path.DirectorySeparatorChar;
        }

        /// <summary>
        /// Turns a URL-escaped search term into a folder name: escaped spaces become underscores,
        /// escaped '#' and '?' are dropped, and the characters &lt; &gt; : " / \ | * are removed.
        /// </summary>
        /// <param name="searchTerm">The URL-escaped search term.</param>
        /// <returns>The folder name derived from the search term.</returns>
        public static string TranslateToFolderName(string searchTerm) {
            string readable = searchTerm.Replace("%20", "_").Replace("%23", "");
            return InvalidFolderChars.Replace(readable, "");
        }

        // Builds the "word<TAB>reading<TAB>JLPT<TAB>meanings" line for the entry whose marker is at
        // markerIndex, or returns null when the file ends before all of the entry's lines.
        private static string ParseEntry(string[] lines, int markerIndex) {
            if (markerIndex + MeaningsOffset >= lines.Length) {
                return null;
            }
            string word = HtmlTags.Replace(lines[markerIndex + WordOffset], "").Trim(); // Kanji
            string status = lines[markerIndex + StatusOffset].Length > 0
                ? lines[markerIndex + StatusOffset]
                : lines[markerIndex + StatusFallbackOffset];
            string reading = ReadingPattern.Replace(status, "$1$2"); // Reading (Furigana)
            // On a match the whole line collapses to "JLPT Nx " (8 characters); anything longer
            // means the line carried no JLPT tag, so the column is left empty.
            string jlpt = JlptPattern.Replace(status, "$1 ");
            if (jlpt.Length > 8) {
                jlpt = "";
            }
            string meanings = CleanMeanings(lines[markerIndex + MeaningsOffset]);
            return word + '\t' + reading + '\t' + jlpt + '\t' + meanings;
        }

        // Strips the HTML from a meanings line while keeping its structure readable as plain text.
        private static string CleanMeanings(string rawLine) {
            string meanings = rawLine.Trim();
            meanings = MeaningTagsPattern.Replace(meanings, "$1[$2]$3"); // Adds [] around meaning-tags
            meanings = SenseTagPattern.Replace(meanings, " $1{$2}$3"); // Adds {} around sense-tags
            meanings = MarkupPattern.Replace(meanings, ""); // Removes example sentences and all html tags, incl. Wiki def.
            meanings = meanings.Replace("&#39;", "'").Replace("&quot;", "\""); // Decodes escaped quotes
            meanings = SpacingPattern.Replace(meanings, "$1$3$5$7 $2$4$6$8"); // Adds a space before a number or capital letter
            meanings = OtherFormsPattern.Replace(meanings, " | $1: "); // Makes the "Other forms" more clearly separated
            return meanings;
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement