Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- //------------------------------------------------------------------------------
/**
 * Joins several strings into one, placing `separator` between neighbours.
 *
 * @param chnks      Pieces to concatenate, in order.
 * @param separator  Character inserted between consecutive pieces
 *                   (never before the first or after the last one).
 * @param result     Output string; its previous contents are discarded.
 */
void JoinString(const vector<string> &chnks, char separator, string &result) {
    result.clear();
    bool need_separator = false;
    for (const string &chunk : chnks) {
        // The separator goes in front of every piece except the first one.
        if (need_separator) {
            result += separator;
        }
        result += chunk;
        need_separator = true;
    }
}
- //------------------------------------------------------------------------------
/**
 * Repeats a word `count` times, placing `separator` between repetitions.
 *
 * @param count      Number of repetitions; 0 produces an empty string.
 * @param word       Word to repeat.
 * @param separator  Character inserted between consecutive repetitions.
 * @param result     Output string; its previous contents are discarded.
 */
void RepeatWordWithSeparator(size_t count, const string &word, char separator, string &result) {
    result.clear();
    if (count == 0) {
        return;
    }
    // The first copy has no separator in front of it; every later one does.
    result += word;
    for (size_t remaining = count - 1; remaining > 0; --remaining) {
        result += separator;
        result += word;
    }
}
- //------------------------------------------------------------------------------
- /** Создает текст документа из слов с заданным числом повторений */
- string GenerateTestDocument(const std::map<string, size_t> words) {
- const char separator = ' ';
- std::vector<std::string> text_parts;
- for (const auto & [term, freq] : words) {
- std::string term_repeated;
- RepeatWordWithSeparator(freq, term, separator, term_repeated);
- text_parts.push_back(term_repeated);
- }
- std::string result;
- JoinString(text_parts, separator, result);
- return result;
- }
- //------------------------------------------------------------------------------
/**
 * Computes the term frequency (TF) of a word inside a single document.
 *
 * @param term       Word whose frequency is requested.
 * @param all_words  Document given as a map from word to occurrence count.
 * @return The count of `term` divided by the sum of all counts in
 *         `all_words`; 0.0 when the term is absent or the counts sum to zero.
 */
double CalcTermFrequency(const std::string & term,
        const std::map<std::string, size_t> & all_words) {
    // Single lookup instead of count() followed by at().
    const auto term_pos = all_words.find(term);
    if (term_pos == all_words.end()) {
        return 0.0;
    }
    // Sum in size_t: an int accumulator would truncate large counts.
    size_t total_words = 0;
    for (const auto & entry : all_words) {
        total_words += entry.second;
    }
    // Every count is zero: avoid 0/0 producing NaN.
    if (total_words == 0) {
        return 0.0;
    }
    return static_cast<double>(term_pos->second) / static_cast<double>(total_words);
}
- //------------------------------------------------------------------------------
/**
 * Computes the inverse document frequency (IDF) of a word over a collection.
 *
 * @param term       Word whose IDF is requested.
 * @param documents  Collection of documents, each a map from word to count.
 * @return Natural logarithm of (number of documents / number of documents
 *         containing `term`); 0.0 when no document contains the term, which
 *         also covers an empty collection.
 */
double CalcTermInverseDocumentFrequency(const std::string & term,
        const std::vector<std::map<std::string, size_t>> & documents) {
    const auto has_term = [&term](const std::map<std::string, size_t> & words) {
        return words.count(term) > 0;
    };
    const size_t documents_with_term_count = static_cast<size_t>(
        std::count_if(documents.begin(), documents.end(), has_term));
    // Without this guard the division below is by zero and the result is
    // inf (or NaN for an empty collection).
    if (documents_with_term_count == 0) {
        return 0.0;
    }
    return std::log(static_cast<double>(documents.size())
                    / static_cast<double>(documents_with_term_count));
}
- //------------------------------------------------------------------------------
- /** Тест на расчет релевантности документов запросам по TF-IDF */
- void TestTfIdfCalculation() {
- SearchServer server;
- const DocumentStatus status_actual = DocumentStatus::ACTUAL;
- const std::vector<int> ratings = {1,2,3,4,5};
- const std::vector<std::map<std::string, size_t>> documents = {
- {
- { "this"s, 1 },
- { "is"s, 1 },
- { "a"s, 2 },
- { "test"s, 1 },
- },
- {
- { "this"s, 1 },
- { "is"s, 1 },
- { "another"s, 2 },
- { "example"s, 3 },
- },
- };
- // Сперва добавим все документы
- for (int doc_id = 0; doc_id < documents.size(); ++doc_id) {
- const auto & words = documents.at(doc_id);
- const std::string text = GenerateTestDocument(words);
- server.AddDocument(doc_id, text, status_actual, ratings);
- }
- // Обойдем все документы, посчитаем их релевантность по TF-IDF, сравним с той, что возвращает поисковик
- for (int doc_id = 0; doc_id < documents.size(); ++doc_id) {
- const auto & words = documents.at(doc_id);
- for (const auto & [term, count] : words) {
- const double tf = CalcTermFrequency(term, words);
- const double idf = CalcTermInverseDocumentFrequency(term, documents);
- const double expected_relevance = tf * idf;
- const int id = doc_id;
- auto HasId = [id](const Document & doc) {
- return doc.id == id;
- };
- auto search_results = server.FindTopDocuments(term);
- auto iter = std::find_if(search_results.begin(), search_results.end(), HasId);
- ASSERT(iter != search_results.end());
- ASSERT_ALMOST_EQUAL(iter->relevance, expected_relevance, 0.01);
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement