# Untitled - test analysis script: question statistics and possible-cheating detection.

import numpy as np
from pandas import read_csv
from math import floor, sqrt, exp


def load_data(path_to_csv, has_header=True, has_rows_names=True):
    """Load a CSV file and return its cell values as a NumPy array.

    Args:
        path_to_csv: Path (or file-like object) handed to ``pandas.read_csv``.
        has_header: When true, the first row is used as column labels.
        has_rows_names: When true, the first column is used as row labels.

    Returns:
        The value matrix, followed by the column labels (only if
        ``has_header``) and then the row labels (only if ``has_rows_names``).
        When both flags are false the bare matrix is returned, not a tuple.
    """
    header = 0 if has_header else None
    index = 0 if has_rows_names else None
    data = read_csv(path_to_csv, header=header, index_col=index)
    # Missing cells are replaced by empty strings before conversion.
    data = data.fillna('')
    # DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` exists in
    # both old and current pandas releases and yields the same ndarray.
    matrix = data.values

    if has_header and has_rows_names:
        return matrix, data.columns, data.index
    if has_header:
        return matrix, data.columns
    if has_rows_names:
        return matrix, data.index
    return matrix


def get_index_of_solvability(answers):
    """Return, per question (column), the column sum divided by the row count.

    ``answers`` is indexed as rows x columns; with 0/1 entries the result is
    the fraction of rows that have a 1 in each column.
    """
    row_count, column_count = answers.shape[0], answers.shape[1]
    per_question_total = answers.sum(axis=0).astype(float)
    # One vectorised division replaces the per-index Python loop.
    return per_question_total[:column_count] / row_count


def separate_students_answers(answers, group_size):
    """Split the rows of ``answers`` into the best and worst scoring groups.

    Rows are ordered by their row sum (ascending); the first ``group_size``
    rows form the worst group and the last ``group_size`` rows the best one.

    Args:
        answers: 2-D array, one row per student.
        group_size: Number of rows to put in each group.

    Returns:
        ``(best_answers, worst_answers)``; both are empty (zero rows) when
        ``group_size`` is 0.
    """
    solved = answers.sum(axis=1)
    sorted_indices = np.argsort(solved)
    sorted_answers = np.take(answers, sorted_indices, axis=0)

    worst_answers = sorted_answers[:group_size]
    if group_size:
        best_answers = sorted_answers[-group_size:]
    else:
        # ``[-0:]`` is ``[0:]`` and would select every row; an empty group
        # must stay empty.
        best_answers = sorted_answers[:0]
    return best_answers, worst_answers


def get_index_of_dicrimination(answers, group_fraction=0.27):
    """Return, per question, best-group solve rate minus worst-group solve rate.

    The best and worst groups are obtained from
    ``separate_students_answers`` and each contain ``group_size`` rows.

    Args:
        answers: 2-D array, one row per student and one column per question.
        group_fraction: Fraction of the students placed in each group
            (defaults to the previously hard-coded 0.27).

    Returns:
        1-D float array with one value per question.
    """
    num_students = answers.shape[0]
    # For fewer than 1 / group_fraction students the floor is 0, which used
    # to divide by zero below and produce NaN; keep at least one per group.
    group_size = max(1, int(floor(num_students * group_fraction)))

    best, worst = separate_students_answers(answers, group_size)
    best_indices = best.sum(axis=0).astype(float) / group_size
    worst_indices = worst.sum(axis=0).astype(float) / group_size
    return best_indices - worst_indices


def get_questions_statistics(answers, questions):
    """Classify questions by their solvability and discrimination indices.

    Returns a 4-tuple of selections from ``questions``, in this order:
    useless (discrimination < 0.2), significant (discrimination > 0.4),
    difficult (solvability <= 0.1) and easy (solvability >= 0.9).
    """
    solvability = get_index_of_solvability(answers)
    discrimination = get_index_of_dicrimination(answers)

    # Boolean masks, one entry per question.
    is_useless = discrimination < 0.2
    is_significant = discrimination > 0.4
    is_difficult = solvability <= 0.1
    is_easy = solvability >= 0.9

    return (
        questions[is_useless],
        questions[is_significant],
        questions[is_difficult],
        questions[is_easy],
    )


def get_shared_answers(answers1, answers2):
    """Count the positions where two answer vectors agree.

    Args:
        answers1: Sequence of answers of the first student.
        answers2: Sequence of answers of the second student (same length).

    Returns:
        Array ``[shared_correct, shared_wrong]``: the number of positions
        where both entries equal 1, and where both entries equal 0.
    """
    first = np.asarray(answers1)
    second = np.asarray(answers2)
    # Vectorised masks instead of two Python-level index loops; this function
    # is called once per pair of students.
    agree = first == second
    shared_correct = np.count_nonzero(agree & (first == 1))
    shared_wrong = np.count_nonzero(agree & (first == 0))
    return np.array([shared_correct, shared_wrong])


def get_neighbors_coefficient(seating_plan, student1, student2):
    """Return a closeness score in (0, 1) for two students in the seating plan.

    The score is ``1 - 1 / (1 + exp(6 - 1.2 * d))`` where ``d`` is the
    Euclidean distance between the first matching grid cells of the two
    students; it equals 0.5 at ``d == 5`` and grows as the seats get closer.
    """
    seat1 = np.nonzero(seating_plan == student1)
    seat2 = np.nonzero(seating_plan == student2)
    # Only the first occurrence of each student is used.
    row_gap = seat1[0][0] - seat2[0][0]
    col_gap = seat1[1][0] - seat2[1][0]
    distance = sqrt(row_gap ** 2 + col_gap ** 2)
    logistic = 1 / (1 + exp(6 - 1.2 * distance))
    return 1 - logistic


def detect_possible_cheating(answers, students, seating_plan):
    """Report pairs of students whose answers and seats suggest copying.

    A pair is examined further when at least 90% of the questions are shared
    (same answer, counted by ``get_shared_answers``) and at least 20% are
    shared wrong answers, or when every question is shared and at least one
    shared answer is wrong. Such a pair is reported if its
    ``get_neighbors_coefficient`` exceeds 0.5.

    Args:
        answers: 2-D array; row ``i`` holds the answers of ``students[i]``.
        students: Sequence of student identifiers found in ``seating_plan``.
        seating_plan: Grid passed through to ``get_neighbors_coefficient``.

    Returns:
        List of strings ``"<left>, <right>: <value>"`` where the value is the
        neighbour coefficient multiplied by the shared-answer ratio.
    """
    num_questions = answers.shape[1]
    possible_cheaters = []
    if num_questions == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return possible_cheaters

    num_students = len(students)
    for i in range(num_students):
        for j in range(i + 1, num_students):
            shared_correct, shared_wrong = get_shared_answers(answers[i], answers[j])
            total_shared = shared_correct + shared_wrong
            # Computed once as a float so the threshold test and the reported
            # value agree (the reported value previously used a bare ``/``,
            # which floor-divides under Python 2).
            shared_ratio = float(total_shared) / num_questions
            a_lot_shared = shared_ratio >= 0.9
            too_many_shared_wrong = float(shared_wrong) / num_questions >= 0.2
            all_shared = total_shared == num_questions
            has_shared_wrong = shared_wrong > 0
            suspicious = (a_lot_shared and too_many_shared_wrong) or (all_shared and has_shared_wrong)
            if not suspicious:
                continue

            left = students[i]
            right = students[j]
            coeff = get_neighbors_coefficient(seating_plan, left, right)
            if coeff > 0.5:
                value = coeff * shared_ratio
                possible_cheaters.append("{}, {}: {}".format(left, right, value))

    return possible_cheaters


def analyze_test(answers_file_name, seating_plan_file_name):
    """Load the answers and seating plan, then print the analysis report.

    Prints the four question categories from ``get_questions_statistics``,
    a blank line, and the pairs returned by ``detect_possible_cheating``.
    """
    answers, questions, students = load_data(answers_file_name)
    # The seating plan file is read without header row and without row names.
    seating_plan = load_data(seating_plan_file_name, False, False)

    question_groups = get_questions_statistics(answers, questions)
    useless_txt, significant_txt, difficult_txt, easy_txt = [
        ", ".join(group) for group in question_groups
    ]
    summary = "Useless questions: {}\nSignificant questions: {}\nDifficult questions: {}\nEasy questions: {}"
    print(summary.format(useless_txt, significant_txt, difficult_txt, easy_txt))
    print("")

    suspects = detect_possible_cheating(answers, students, seating_plan)
    suspects_txt = "\n".join(suspects)
    print("Possible cheaters (with coefficients): \n{}".format(suspects_txt))


# Module-level call: runs the analysis on the hard-coded CSV paths whenever
# this file is executed (or imported).
analyze_test(
    answers_file_name="data/2018-04-16_20-12-08.answ.csv",
    seating_plan_file_name="data/2018-04-16_20-12-08.seat.csv",
)