Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def create_semester_data(data, semester, target):
    """Slice *data* for one semester and one prediction target.

    Rows with a missing value in the target's source column are removed, the
    target column is derived from that source column, the source column is
    dropped, and every column whose name contains the number of ``semester``
    or of any later semester (up to 8) is discarded so that no information
    from the predicted semester onwards remains among the features.

    Args:
        data: pandas DataFrame with string column names. Per the original
            author's note this is the data obtained after
            ``fds['dynamic_interm']``.
        semester: integer semester number.
        target: target name. Recognised values and the column they add:
            ``'drop_probability'`` -> ``'drop'`` (1 when
            ``semester_dropped_number >= semester``, else 0);
            ``'debt_probability'`` -> ``'debt'`` (1 when
            ``DEBTS_NUMBER_SEMESTER_<semester>`` is non-zero, else 0);
            ``'mean_debt'`` -> ``'mean_debt'`` (copy of
            ``DEBTS_NUMBER_SEMESTER_<semester>``);
            ``'mean_mark'`` -> ``'mean_mark'`` (copy of
            ``MARKS_MEAN_SEMESTER_<semester>``);
            ``'scopus_probability'`` -> ``'scopus'`` (1 when
            ``SCOPUS_PUBLICATIONS_NUMBER`` is non-zero, else 0);
            ``'publication_probability'`` -> ``'publication'`` (1 when
            ``OTHER_PUBLICATIONS_NUMBER`` is non-zero, else 0).
            Any other value adds no target column; only the feature
            filtering is applied.

    Returns:
        A new DataFrame (``data`` itself is not modified) whose columns keep
        their original relative order, with the target column last. The
        original author notes the result can be passed to ``Preprocessor()``.
    """
    debts_col = 'DEBTS_NUMBER_SEMESTER_' + str(semester)
    marks_col = 'MARKS_MEAN_SEMESTER_' + str(semester)

    def as_flag(series):
        # Any non-zero value becomes 1, zero becomes 0.
        return series.astype(bool).astype(int)

    def reached_semester(series):
        return (series >= semester).astype(int)

    def unchanged(series):
        return series

    # target name -> (source column, target column, transform of the source)
    target_specs = {
        'drop_probability': ('semester_dropped_number', 'drop', reached_semester),
        'debt_probability': (debts_col, 'debt', as_flag),
        'mean_debt': (debts_col, 'mean_debt', unchanged),
        'mean_mark': (marks_col, 'mean_mark', unchanged),
        'scopus_probability': ('SCOPUS_PUBLICATIONS_NUMBER', 'scopus', as_flag),
        'publication_probability': ('OTHER_PUBLICATIONS_NUMBER', 'publication', as_flag),
    }

    # Work on a copy so the caller's frame is never mutated.
    new_data = data.copy()

    spec = target_specs.get(target)
    if spec is not None:
        source_col, target_col, transform = spec
        new_data.dropna(subset=[source_col], inplace=True)
        new_data[target_col] = transform(new_data[source_col])
        new_data = new_data.drop([source_col], axis=1)

    # Columns mentioning the requested semester or any later one (up to 8)
    # are matched by plain substring search on the semester number.
    later_semesters = [str(number) for number in range(semester, 9)]

    # Select with a list: pandas 2.0+ rejects set indexers, and a list keeps
    # the column order deterministic.
    kept_columns = [
        name for name in new_data.columns
        if not any(number in name for number in later_semesters)
    ]
    return new_data[kept_columns]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement