// JLPT word list from jisho.org

using System;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace JLPTWords {
    /// <summary>
    /// Console tool that downloads jisho.org search result pages for a search term and
    /// condenses the word entries found in them into one tab-separated text file.
    /// </summary>
    class Program {

        /// <summary>Text whose presence in a downloaded page means another result page follows.</summary>
        private const string MoreWordsMarker = "\">More <span class='accesskey'>W</span>ords";

        /// <summary>Text that marks the line on which a word entry starts in a page file.</summary>
        private const string EntryMarker = "<span class=\"text\">";

        // Line offsets, relative to the entry marker line, at which the scraper reads each field.
        private const int WordOffset = 1;
        private const int DetailsOffset = 7;
        private const int DetailsFallbackOffset = 9; // read instead when the details line is empty
        private const int MeaningsOffset = 13;       // furthest line read for one entry

        // The patterns below are built once instead of once per word.

        /// <summary>Matches one HTML tag together with the whitespace around it.</summary>
        private static readonly Regex HtmlTags = new Regex(@"\s*<.+?>\s*");

        /// <summary>Captures the text after the last of up to two "Sentence search for" links.</summary>
        private static readonly Regex Reading = new Regex("(?:.*?(?:Sentence search for ).*?(?:Sentence search for )(.+?)<\\/a>.*)|(?:.*?(?:Sentence search for )(.+?)<\\/a>.*)");

        /// <summary>Captures the "JLPT N#" label of a details line.</summary>
        private static readonly Regex JlptLevel = new Regex(".+?(JLPT N\\d).+");

        /// <summary>Matches a meaning-tags div whose content has no '(', 'O' or ')'.</summary>
        private static readonly Regex MeaningTags = new Regex("(<div class=\"meaning-tags\">)([^(O)]+?)(<\\/div>)");

        /// <summary>Matches a sense-tag span up to the next closing div.</summary>
        private static readonly Regex SenseTags = new Regex("(<span class=\"sense-tag.*?\">)(.+?)(<\\/div>)");

        /// <summary>Matches example sentences, HTML tags, zero-width-space entities and the Wikipedia blurb.</summary>
        private static readonly Regex Markup = new Regex("((<div class=\"sentence\">).*?(<\\/div>))|(<.+?>)|(&#8203;)|(\\[Wikipedia definition.+?Read more)");

        /// <summary>Matches the places where a separating space is missing after the markup was removed.</summary>
        private static readonly Regex MissingSpaces = new Regex(@"([^\s\(\[\{A-Z0-9])([A-Z]|[0-9]+\.|\[)(?= )(?<!.+\[Notes\].+)|(\S)(\[)|(\])(\S)|([A-Z])([0-9])");

        /// <summary>Matches the "Other forms" heading and the whitespace in front of it.</summary>
        private static readonly Regex OtherForms = new Regex("\\s*(Other forms)");

        /// <summary>Matches the characters and the encoded '?' that are dropped from folder names.</summary>
        private static readonly Regex FolderNameFilter = new Regex("[\\<\\>\\:\\\"\\/\\\\\\|\\*]|\\%3F");

        /// <summary>Matches the "page#.txt" names written by <see cref="DownloadInfo"/>.</summary>
        private static readonly Regex PageFileName = new Regex(@"^page([0-9]+)\.txt$", RegexOptions.IgnoreCase);

        /// <summary>
        /// Entry point: downloads the result pages for the hard-coded search term, gathers the
        /// words into a text file and waits for ENTER before exiting.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args) {
            string searchTerm = "#jlpt-n3 #word"; // The term to search for on jisho
            searchTerm = searchTerm.Replace(" ", "%20").Replace("#", "%23").Replace("?", "%3F");

            string directoryPath = GetDirectoryPath(searchTerm);
            Directory.CreateDirectory(directoryPath);

            DownloadInfo(searchTerm); // Will skip this step if files are already downloaded
                                      // Delete the \files\ folder if you want a clean download

            GatherInfo(directoryPath, TranslateToFolderName(searchTerm) + ".txt");

            Console.WriteLine("Press ENTER to terminate...");
            Console.ReadLine();
        }

        /// <summary>
        /// Downloads the jisho.org result pages for <paramref name="searchTerm"/> one by one into
        /// "page#.txt" files inside the term's directory, until a page no longer contains the
        /// "More Words" link or a page file already exists.
        /// </summary>
        /// <param name="searchTerm">Search term, already encoded for use inside the URL.</param>
        public static void DownloadInfo(string searchTerm) {
            int currentPage = 0;  // int: a byte counter silently wraps to 0 after page 255
            int pagesWritten = 0; // only pages actually downloaded, reported at the end
            string result;

            string filePath = GetDirectoryPath(searchTerm);
            Directory.CreateDirectory(filePath);

            using (WebClient wc = new WebClient()) {
                wc.Encoding = Encoding.UTF8;

                do {
                    currentPage++;
                    string pageFile = filePath + "page" + currentPage + ".txt";
                    if (File.Exists(pageFile)) {
                        Console.WriteLine("File " + "page" + currentPage + ".txt" + " already exists at " + filePath
                            + "\nSkipping downloads of pages " + currentPage + " and onward.");
                        break;
                    }

                    string url = @"http://jisho.org/search/" + searchTerm + "?page=" + currentPage;
                    Console.WriteLine("Starting download of " + url);
                    result = wc.DownloadString(url);
                    Console.WriteLine("Download finished.");

                    // WriteAllText creates the file itself, no separate File.Create is needed.
                    File.WriteAllText(pageFile, result);
                    Console.WriteLine("Created file " + pageFile);
                    Console.WriteLine("Wrote to file " + pageFile + "\n");
                    pagesWritten++;

                } while (result.Contains(MoreWordsMarker));
            }

            Console.WriteLine("Done! Created " + pagesWritten + " files at " + filePath + '\n');
        }

        /// <summary>
        /// Reads every file in <paramref name="directorypath"/>, extracts the word entries and
        /// writes them, one per line and tab-separated, to the output file in the same directory.
        /// Does nothing except print a message when the output file already exists.
        /// </summary>
        /// <param name="directorypath">Directory holding the page files; must end with a directory separator.</param>
        /// <param name="outputfilename">Name of the file to create inside <paramref name="directorypath"/>.</param>
        public static void GatherInfo(string directorypath, string outputfilename) {
            string outputPath = directorypath + outputfilename;

            // Listed before the output file is created, so the output is never read as a page.
            string[] files = Directory.GetFiles(directorypath);
            // GetFiles gives no ordering guarantee, and a name sort puts page10 before page2.
            Array.Sort(files, ComparePageFiles);
            string[] info = new string[files.Length];

            if (File.Exists(outputPath)) {
                Console.WriteLine("File " + outputPath + " already exists. Cancelling.");
                return;
            }

            File.Create(outputPath).Dispose();
            Console.WriteLine("File " + outputPath + " created for gathering data.\n");

            for (int i = 0; i < files.Length; i++) {
                string[] lines = File.ReadAllLines(files[i]);
                StringBuilder page = new StringBuilder();

                for (int j = 0; j < lines.Length; j++) {
                    if (!lines[j].Contains(EntryMarker))
                        continue;

                    // An entry cut off by the end of the file would otherwise be read out of range.
                    if (j + MeaningsOffset >= lines.Length) {
                        Console.WriteLine("Skipping incomplete entry at line " + (j + 1) + " in " + files[i]);
                        continue;
                    }

                    if (page.Length > 0)
                        page.Append('\n');

                    page.Append(HtmlTags.Replace(lines[j + WordOffset], "").Trim()).Append('\t'); // Kanji
                    Console.WriteLine("Word found in page file " + (i+1) + " in " + files[i]);

                    string details = lines[j + DetailsOffset].Length > 0
                        ? lines[j + DetailsOffset]
                        : lines[j + DetailsFallbackOffset];
                    page.Append(Reading.Replace(details, "$1$2")).Append('\t'); // Reading (Furigana)

                    // Without a JLPT label the replacement leaves the whole line, which is
                    // longer than the 8 characters of "JLPT N# " and is therefore dropped.
                    string jlpt = JlptLevel.Replace(details, "$1 ");
                    page.Append(jlpt.Length > 8 ? "" : jlpt).Append('\t'); // JLPT level

                    page.Append(CleanMeanings(lines[j + MeaningsOffset])); // Meaning(s) of the word
                }

                if (page.Length > 0)
                    info[i] = page.ToString();
            }

            Console.WriteLine("\nAll information gathered.");
            using (StreamWriter file = new StreamWriter(outputPath, true)) {
                Console.WriteLine("Started writing ~" + info.Count(x => x != null) * 20 + " lines to file " + outputPath);
                file.WriteLine("Word\tReading\tJLPT\tMeaning");
                foreach (string s in info)
                    if (s != null)
                        file.WriteLine(s);
            }
            Console.WriteLine("\nWriting finished.");
        }

        /// <summary>Turns the raw HTML line holding a word's meanings into plain, readable text.</summary>
        private static string CleanMeanings(string rawLine) {
            string meanings = rawLine.Trim();
            meanings = MeaningTags.Replace(meanings, "$1[$2]$3"); // Adds [] around meaning-tags
            meanings = SenseTags.Replace(meanings, " $1{$2}$3"); // Adds {} around sense-tags
            meanings = Markup.Replace(meanings, ""); // Removes example sentences and all html tags, incl. Wiki def.
            meanings = meanings.Replace("&#39;", "'").Replace("&quot;", "\"");
            meanings = MissingSpaces.Replace(meanings, "$1$3$5$7 $2$4$6$8"); // Adds a space before a number or capital letter
            meanings = OtherForms.Replace(meanings, " | $1: "); // Makes the "Other forms" more clearly separated
            return meanings;
        }

        /// <summary>Orders page files by page number; other files come last, ordered by path.</summary>
        private static int ComparePageFiles(string left, string right) {
            int leftPage = PageNumberOf(left);
            int rightPage = PageNumberOf(right);
            if (leftPage != rightPage)
                return leftPage.CompareTo(rightPage);
            return string.CompareOrdinal(left, right);
        }

        /// <summary>Returns the number in a "page#.txt" file name, or int.MaxValue for any other name.</summary>
        private static int PageNumberOf(string path) {
            Match match = PageFileName.Match(Path.GetFileName(path));
            int page;
            if (match.Success && int.TryParse(match.Groups[1].Value, out page))
                return page;
            return int.MaxValue;
        }

        /// <summary>
        /// Returns the directory used for <paramref name="searchTerm"/>: the "files" folder below
        /// the current directory, then the term's folder name, ending with a directory separator.
        /// </summary>
        /// <param name="searchTerm">Encoded search term.</param>
        public static string GetDirectoryPath(string searchTerm) {
            // Path.Combine picks the platform's separator instead of a hard-coded backslash.
            // The trailing separator is kept because callers append file names directly.
            return Path.Combine(Directory.GetCurrentDirectory(), "files", TranslateToFolderName(searchTerm))
                + Path.DirectorySeparatorChar;
        }

        /// <summary>
        /// Converts an encoded search term into a folder name: "%20" becomes "_", "%23" is
        /// removed, and the characters &lt; &gt; : " / \ | * as well as "%3F" are dropped.
        /// </summary>
        /// <param name="searchTerm">Encoded search term.</param>
        public static string TranslateToFolderName(string searchTerm) {
            string readable = searchTerm.Replace("%20", "_").Replace("%23", "");
            return FolderNameFilter.Replace(readable, "");
        }
    }
}