Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- from pandas import read_csv
- from math import floor, sqrt, exp
def load_data(path_to_csv, has_header=True, has_rows_names=True):
    """Read a CSV file and return its cell values as a NumPy array.

    Missing cells are replaced with empty strings before the conversion.

    Args:
        path_to_csv: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        has_header: When true, the first row is used as column labels.
        has_rows_names: When true, the first column is used as row labels.

    Returns:
        The matrix alone when neither labels are requested. Otherwise a
        tuple that starts with the matrix, followed by the column labels
        (if ``has_header``) and then the row labels (if ``has_rows_names``).
    """
    header = 0 if has_header else None
    index_col = 0 if has_rows_names else None

    data = read_csv(path_to_csv, header=header, index_col=index_col)
    data.fillna('', inplace=True)

    # DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` yields the
    # same ndarray and exists in both old and current pandas releases.
    matrix = data.values

    result = [matrix]
    if has_header:
        result.append(data.columns)
    if has_rows_names:
        result.append(data.index)

    # Keep the historical contract: a bare matrix when no labels are returned.
    if len(result) == 1:
        return matrix
    return tuple(result)
def get_index_of_solvability(answers):
    """Return, for every question, the share of students who solved it.

    Args:
        answers: 2-D array indexed as ``answers[student, question]``; the
            entries are summed per column.

    Returns:
        1-D float array with one value per column: the column sum divided
        by the number of rows.
    """
    student_count = answers.shape[0]
    # Column sums count how many students solved each question.
    solved_per_question = answers.sum(axis=0).astype(float)
    return solved_per_question / student_count
def separate_students_answers(answers, group_size):
    """Split off the highest- and lowest-scoring students' answer rows.

    Rows are ordered by their row sum (number of solved questions).

    Args:
        answers: 2-D array indexed as ``answers[student, question]``.
        group_size: Number of rows to take from each end of the ranking.

    Returns:
        Tuple ``(best_answers, worst_answers)``: the last ``group_size`` rows
        and the first ``group_size`` rows of the ranking, respectively.
    """
    solved = answers.sum(axis=1)
    sorted_indices = np.argsort(solved)
    sorted_answers = np.take(answers, sorted_indices, axis=0)

    worst_answers = sorted_answers[:group_size]
    # ``[-0:]`` is the same as ``[0:]`` and would return every row, so an
    # empty group has to be requested explicitly.
    if group_size == 0:
        best_answers = sorted_answers[:0]
    else:
        best_answers = sorted_answers[-group_size:]
    return best_answers, worst_answers
def get_index_of_dicrimination(answers, group_fraction=0.27):
    """Return the index of discrimination for every question.

    For each question this is the share of the best-scoring group that
    solved it minus the share of the worst-scoring group that solved it.

    Args:
        answers: 2-D array indexed as ``answers[student, question]``.
        group_fraction: Fraction of the students placed in each of the two
            compared groups. Defaults to 0.27.

    Returns:
        1-D float array with one value per question. When there are no
        students at all every entry is NaN, since the index is undefined.
    """
    num_students = answers.shape[0]
    num_questions = answers.shape[1]
    if num_students == 0:
        return np.full(num_questions, np.nan)

    # For very small classes the floor is 0, which used to divide by zero;
    # compare at least one student per group, and never more than exist.
    group_size = int(floor(num_students * group_fraction))
    group_size = min(num_students, max(1, group_size))

    best, worst = separate_students_answers(answers, group_size)
    best_share = best.sum(axis=0).astype(float) / group_size
    worst_share = worst.sum(axis=0).astype(float) / group_size
    return best_share - worst_share
def get_questions_statistics(answers, questions):
    """Classify questions by their solvability and discrimination indices.

    Args:
        answers: Answer matrix handed to the two index functions.
        questions: Question labels, one per column of ``answers``; must
            support boolean-mask indexing.

    Returns:
        Tuple of four selections from ``questions``, in this order:
        useless (discrimination below 0.2), significant (discrimination
        above 0.4), difficult (solvability at most 0.1) and easy
        (solvability at least 0.9).
    """
    solvability = get_index_of_solvability(answers)
    discrimination = get_index_of_dicrimination(answers)

    is_useless = discrimination < 0.2
    is_significant = discrimination > 0.4
    is_difficult = solvability <= 0.1
    is_easy = solvability >= 0.9

    return (
        questions[is_useless],
        questions[is_significant],
        questions[is_difficult],
        questions[is_easy],
    )
def get_shared_answers(answers1, answers2):
    """Count the answers two students share, split into correct and wrong.

    An answer counts as shared-correct when both entries are equal to 1 and
    as shared-wrong when both entries are equal to 0.

    Args:
        answers1: Sequence of the first student's answers.
        answers2: Sequence of the second student's answers.

    Returns:
        Integer array ``[shared_correct, shared_wrong]``.

    Raises:
        ValueError: If the two answer sequences differ in shape.
    """
    first = np.asarray(answers1)
    second = np.asarray(answers2)
    if first.shape != second.shape:
        raise ValueError(
            "answer sequences must have the same shape, got {} and {}".format(
                first.shape, second.shape))

    # One vectorised comparison replaces two element-by-element Python loops.
    same = first == second
    shared_correct = np.count_nonzero(same & (first == 1))
    shared_wrong = np.count_nonzero(same & (first == 0))
    return np.array([shared_correct, shared_wrong], dtype=int)
def get_neighbors_coefficient(seating_plan, student1, student2):
    """Return a closeness coefficient for two students in the seating plan.

    The Euclidean distance between the first cell matching each student is
    pushed through a decreasing logistic curve, ``1 - 1 / (1 + e^(6 - 1.2 d))``,
    so small distances give values near 1 and large distances values near 0.

    Args:
        seating_plan: 2-D array whose cells are compared against the
            student identifiers.
        student1: Identifier of the first student.
        student2: Identifier of the second student.

    Returns:
        The coefficient as a float.
    """
    position_a = np.where(seating_plan == student1)
    position_b = np.where(seating_plan == student2)

    # Only the first match on each axis is used.
    row_gap = position_a[0][0] - position_b[0][0]
    col_gap = position_a[1][0] - position_b[1][0]
    distance = sqrt(row_gap ** 2 + col_gap ** 2)

    logistic = 1 / (1 + exp(6 - 1.2 * distance))
    return 1 - logistic
def detect_possible_cheating(answers, students, seating_plan):
    """List pairs of students whose answers and seats suggest copying.

    A pair is examined further when either at least 90% of the questions
    are shared answers and at least 20% of the questions are shared wrong
    answers, or every answer is shared and at least one of them is wrong.
    Such a pair is reported only if its neighbors coefficient exceeds 0.5.

    Args:
        answers: 2-D array where ``answers[i]`` holds the answers of
            ``students[i]``.
        students: Sequence of student identifiers.
        seating_plan: Seating plan passed to ``get_neighbors_coefficient``.

    Returns:
        List of strings ``"<left>, <right>: <value>"`` where the value is
        the neighbors coefficient multiplied by the shared-answer fraction.
        Empty when there are no questions to compare.
    """
    num_questions = answers.shape[1]
    if num_questions == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return []

    possible_cheaters = []
    num_students = len(students)
    for i in range(num_students):
        for j in range(i + 1, num_students):
            left = students[i]
            right = students[j]
            shared = get_shared_answers(answers[i], answers[j])
            shared_wrong = shared[1]
            total_shared = shared[0] + shared_wrong

            # Computed once as a true float so the reported value below does
            # not fall back to integer division on Python 2.
            shared_fraction = float(total_shared) / num_questions
            wrong_fraction = float(shared_wrong) / num_questions

            a_lot_shared = shared_fraction >= 0.9
            too_many_shared_wrong = wrong_fraction >= 0.2
            all_shared = total_shared == num_questions
            has_shared_wrong = shared_wrong > 0

            suspicious = ((a_lot_shared and too_many_shared_wrong)
                          or (all_shared and has_shared_wrong))
            if not suspicious:
                continue

            coeff = get_neighbors_coefficient(seating_plan, left, right)
            if coeff > 0.5:
                value = coeff * shared_fraction
                possible_cheaters.append("{}, {}: {}".format(left, right, value))
    return possible_cheaters
def analyze_test(answers_file_name, seating_plan_file_name):
    """Load a test's answers and seating plan and print an analysis report.

    The report lists the useless, significant, difficult and easy questions,
    followed by the pairs of students flagged as possible cheaters.

    Args:
        answers_file_name: CSV file with a header row and row names, loaded
            as the answer matrix, question labels and student labels.
        seating_plan_file_name: CSV file without header or row names,
            loaded as the seating plan.
    """
    answers, questions, students = load_data(answers_file_name)
    seating_plan = load_data(seating_plan_file_name, False, False)

    useless, significant, difficult, easy = get_questions_statistics(answers, questions)
    questions_report = (
        "Useless questions: {}\nSignificant questions: {}\nDifficult questions: {}\nEasy questions: {}"
    ).format(
        ", ".join(useless),
        ", ".join(significant),
        ", ".join(difficult),
        ", ".join(easy),
    )
    print(questions_report)
    print("")

    suspects = detect_possible_cheating(answers, students, seating_plan)
    cheaters_report = "Possible cheaters (with coefficients): \n{}".format("\n".join(suspects))
    print(cheaters_report)
# Script entry: analyse the recorded test session. This call is at module
# level, so it also runs whenever the file is imported.
analyze_test(
    answers_file_name="data/2018-04-16_20-12-08.answ.csv",
    seating_plan_file_name="data/2018-04-16_20-12-08.seat.csv",
)
Add Comment
Please, Sign In to add comment