Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using System.Threading.Tasks;
namespace baes
{
    /// <summary>
    /// Console tool that trains a multinomial naive Bayes classifier on a labelled
    /// news file (one document per line, the first token being the category label)
    /// and writes the predicted category of every line of a test file to an output file.
    /// </summary>
    class Program
    {
        // Input/output locations used by Main.
        const string TrainPath = @"D:\Study\news_trainC2.txt";
        const string TestPath = @"D:\Study\news_test.txt";
        const string OutputPath = @"D:\Study\out.txt";

        // Known category labels. The index of a label is the class index used everywhere below.
        static readonly string[] Categories =
        {
            "science", "style", "culture", "life", "economics",
            "business", "travel", "forces", "media", "sport"
        };

        // Characters that separate tokens in both the training and the test file.
        static readonly char[] Delimiters =
        {
            '.', ' ', '!', '"', '-', '\t', '\n', ',', '(', ')', '?', ':', ';', '«', '»', '/'
        };

        /// <summary>
        /// Counts how many entries of <paramref name="s"/> contain <paramref name="word"/>
        /// as an ordinal substring.
        /// </summary>
        /// <param name="word">Substring to look for.</param>
        /// <param name="s">Words to search in.</param>
        /// <returns>The number of entries that contain the substring.</returns>
        static int count_word(string word, string[] s)
        {
            int count = 0;
            foreach (string item in s)
            {
                // Substring match (not equality): together with the character trimming
                // done by the tokenizers this acts as a crude stemmer.
                if (item.Contains(word))
                {
                    count++;
                }
            }
            return count;
        }

        /// <summary>
        /// Splits the training lines into per-class word lists.
        /// </summary>
        /// <param name="trainLines">Training documents, one per line, label first.</param>
        /// <param name="docsPerClass">Receives the number of training documents per class.</param>
        /// <returns>
        /// For every class index, the words of all its documents: the last character of
        /// every word longer than one character is removed, and only words longer than
        /// three characters after that are kept.
        /// </returns>
        internal static string[][] BuildClassWords(string[] trainLines, out int[] docsPerClass)
        {
            docsPerClass = new int[Categories.Length];
            List<string>[] collected = new List<string>[Categories.Length];
            for (int c = 0; c < collected.Length; c++)
            {
                collected[c] = new List<string>();
            }

            foreach (string line in trainLines)
            {
                string[] tokens = line.Split(Delimiters);
                int c = Array.IndexOf(Categories, tokens[0]);
                if (c < 0)
                {
                    // Line without a known label: it cannot contribute to any class.
                    continue;
                }

                docsPerClass[c]++;

                // Start at 1: token 0 is the label itself and must not become a feature.
                // Tokenizing per document also keeps words of adjacent documents apart.
                for (int j = 1; j < tokens.Length; j++)
                {
                    string token = tokens[j];
                    if (token.Length > 1)
                    {
                        token = token.Substring(0, token.Length - 1);
                    }
                    if (token.Length > 3)
                    {
                        collected[c].Add(token);
                    }
                }
            }

            string[][] classWords = new string[collected.Length][];
            for (int c = 0; c < collected.Length; c++)
            {
                classWords[c] = collected[c].ToArray();
            }
            return classWords;
        }

        /// <summary>
        /// Counts the distinct words over all classes (the vocabulary size used for smoothing).
        /// </summary>
        /// <param name="classWords">Per-class word lists.</param>
        /// <returns>Number of distinct words; a word shared by several classes counts once.</returns>
        internal static int CountVocabulary(string[][] classWords)
        {
            HashSet<string> vocabulary = new HashSet<string>(StringComparer.Ordinal);
            foreach (string[] words in classWords)
            {
                vocabulary.UnionWith(words);
            }
            return vocabulary.Count;
        }

        /// <summary>
        /// Tokenizes one test document.
        /// </summary>
        /// <param name="line">The raw test line.</param>
        /// <returns>
        /// The words of the line: every token except the first that is longer than three
        /// characters loses its first and last character, then only tokens longer than
        /// three characters are kept.
        /// </returns>
        internal static string[] TokenizeTestLine(string line)
        {
            string[] tokens = line.Split(Delimiters);
            List<string> kept = new List<string>();
            for (int j = 0; j < tokens.Length; j++)
            {
                string token = tokens[j];
                // NOTE(review): token 0 is left untrimmed exactly as before; confirm
                // whether test lines really start with a token that must be kept intact.
                if (j > 0 && token.Length > 3)
                {
                    token = token.Substring(1, token.Length - 2);
                }
                if (token.Length > 3)
                {
                    kept.Add(token);
                }
            }
            return kept.ToArray();
        }

        /// <summary>
        /// Picks the most probable class for a tokenized document using
        /// log P(c) + sum over words of log((count(w, c) + 1) / (V + Lc)).
        /// </summary>
        /// <param name="words">Words of the document to classify.</param>
        /// <param name="classWords">Per-class training words.</param>
        /// <param name="docsPerClass">Number of training documents per class.</param>
        /// <param name="vocabularySize">Number of distinct training words (V).</param>
        /// <returns>The winning class index, or -1 when no class has any training document.</returns>
        internal static int PredictClass(string[] words, string[][] classWords, int[] docsPerClass, int vocabularySize)
        {
            int totalDocs = docsPerClass.Sum();
            int best = -1;
            // Log-probabilities are negative, so the search must start below every finite score.
            double bestScore = double.NegativeInfinity;

            for (int c = 0; c < classWords.Length; c++)
            {
                if (docsPerClass[c] == 0)
                {
                    // A class that was never observed has prior 0 and cannot win.
                    continue;
                }

                // All arithmetic is done in double: integer division would truncate the
                // ratios to 0 and make the logarithm -Infinity.
                double score = Math.Log((double)docsPerClass[c] / totalDocs);
                double denominator = (double)vocabularySize + classWords[c].Length;

                if (denominator > 0)
                {
                    foreach (string word in words)
                    {
                        // Add-one smoothing keeps unseen words finite instead of skipping them,
                        // so every class is scored over the same set of words.
                        int occurrences = count_word(word, classWords[c]);
                        score += Math.Log((occurrences + 1) / denominator);
                    }
                }

                if (score > bestScore)
                {
                    bestScore = score;
                    best = c;
                }
            }
            return best;
        }

        /// <summary>
        /// Trains on <c>TrainPath</c>, classifies every line of <c>TestPath</c> and writes
        /// one predicted label per line to <c>OutputPath</c> (UTF-8). Progress and
        /// training statistics are echoed to the console.
        /// </summary>
        static void Main()
        {
            Console.OutputEncoding = Encoding.UTF8;

            string[] trainLines = System.IO.File.ReadAllLines(TrainPath);
            int[] docsPerClass;
            string[][] classWords = BuildClassWords(trainLines, out docsPerClass);

            for (int c = 0; c < docsPerClass.Length; c++)
            {
                Console.Write("Dc[{0}]: {1}; ", c, docsPerClass[c]);
            }
            Console.Write("\n");

            int totalDocs = docsPerClass.Sum();
            Console.Write("D = " + totalDocs + "\n");
            if (totalDocs == 0)
            {
                // Without a single labelled document there is nothing to base a prediction on.
                Console.Error.WriteLine("No labelled training documents found in " + TrainPath);
                Environment.ExitCode = 1;
                return;
            }

            int vocabularySize = CountVocabulary(classWords);
            Console.Write("v = " + vocabularySize + "\n");

            string[] testLines = System.IO.File.ReadAllLines(TestPath);
            string[] answer = new string[testLines.Length];
            for (int i = 0; i < testLines.Length; i++)
            {
                string[] words = TokenizeTestLine(testLines[i]);
                int best = PredictClass(words, classWords, docsPerClass, vocabularySize);
                answer[i] = Categories[best];
                Console.Write("{0}-ый шаг: {1}\n", i, answer[i]);
            }

            System.IO.File.WriteAllLines(OutputPath, answer, Encoding.UTF8);
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement