# Untitled

# Rename the frame's two columns, discard every row that has a missing value,
# then separate the text feature from the target label.
data.columns = ['text', 'label']
data = data.dropna(axis=0, how='any')
X, y = data['text'], data['label']

# TF-IDF text features with the vocabulary capped at 5000 terms
# (tune max_features as needed).
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Classifier under evaluation; swap in any other estimator here.
model = LogisticRegression()

# Metrics recorded for each fold. Accuracy takes no averaging argument; the
# remaining scorers are all built with average='weighted'.
scoring = {'accuracy': make_scorer(accuracy_score)}
scoring.update(
    (name, make_scorer(score_fn, average='weighted'))
    for name, score_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
)

# Number of cross-validation folds (adjust as needed).
k_folds = 10

# Stratified splitter that shuffles with a fixed seed.
stratified_kfold = StratifiedKFold(k_folds, shuffle=True, random_state=42)

# Perform k-fold cross-validation.
# The vectorizer and classifier are chained in a pipeline so that
# cross_validate re-fits TF-IDF on the training part of each fold only.
# Fitting the vectorizer on all of X before splitting lets vocabulary and IDF
# statistics from the held-out fold leak into training and biases the scores.
from sklearn.pipeline import make_pipeline

cv_pipeline = make_pipeline(tfidf_vectorizer, model)
results = cross_validate(cv_pipeline, X, y, cv=stratified_kfold, scoring=scoring)

# cross_validate only fits clones of the pipeline, so fit the shared
# vectorizer on the full corpus here; this keeps tfidf_vectorizer fitted for
# the code that follows, exactly as the earlier fit_transform(X) call did.
tfidf_vectorizer.fit(X)

# Report the mean and standard deviation of every entry that cross_validate
# returned, formatted to four decimal places.
for metric in results:
    values = results[metric]
    label = metric.capitalize()
    mean_value = np.mean(values)
    std_value = np.std(values)
    print(f"{label} (Mean): {mean_value:.4f}")
    print(f"{label} (Std): {std_value:.4f}")

# Exporting the model
import joblib

# cross_validate fits clones of the estimator it is given, so `model` itself
# has not been trained at this point. Fit the vectorizer and the classifier on
# the full dataset so that the two saved artifacts are trained and consistent
# with each other.
X_tfidf = tfidf_vectorizer.fit_transform(X)
model.fit(X_tfidf, y)

# Save the fitted model to a .pkl file
joblib.dump(model, 'suicide_log_reg.pkl')

# Save the fitted tfidf_vectorizer to a .pkl file; it must be used to
# transform new text before calling the saved model.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')