""" This script processes a CSV dataset containing preprocessed textual data, applies a trained Naive Bayes model for classification, and sends the resulting predictions to an InfluxDB instance. Key Steps: 1. Load a CSV file with textual data and corresponding dates. 2. Apply random timestamps to ensure unique date entries. 3. Use a pre-trained Naive Bayes model to classify the 'cleaned_text' column. 4. Vectorize the text using TF-IDF. 5. Append predictions to the dataset and print counts of target labels. 6. Send the processed data, including 'actual_target' and 'predicted_target', to InfluxDB. Requirements: - InfluxDB client library (`influxdb_client`) - Scikit-learn (`sklearn`) - Pandas (`pandas`) - Pickle module for loading pre-trained models """ import os import pandas as pd from influxdb_client import InfluxDBClient from influxdb_client.client.write_api import SYNCHRONOUS import pickle from sklearn.feature_extraction.text import TfidfVectorizer import random from datetime import timedelta # To ignore warnings import warnings warnings.filterwarnings("ignore") # InfluxDB credentials and configuration org = "Organization Name" token = "Token" url = "http://127.0.0.1:8086" bucket = "Bucket Name" # Base directory (this script and the subfolders are in the same parent directory) base_dir = os.path.dirname(os.path.dirname(__file__)) # File paths for the dataset and model csv_path = os.path.join( base_dir, "Textual_Analysis", "Dataset", "Preprocessed_Text_Dataset.csv" ) model_path = os.path.join( base_dir, "Textual_Analysis", "Models", "Classification", "pkl_models", "Naive_Bayes_Best_Model.pkl", ) # Step 1: Read the CSV file data = pd.read_csv(csv_path) # Step 2: Filter necessary columns and set 'date' as the index data["date"] = pd.to_datetime(data["date"]) # Step 3: Add random time to the date column to ensure uniqueness def add_random_time(row): """Adds a random time offset to a date to ensure unique timestamps.""" random_time = timedelta( hours=random.randint(0, 23), minutes=random.randint(0, 59), seconds=random.randint(0, 59), ) return row + random_time # Apply the function to the date column data["date"] = data["date"].apply(add_random_time) # Step 4: Set 'date' as the index again after modification data.set_index("date", inplace=True) # Step 5: Load the pre-trained model with open(model_path, "rb") as file: model = pickle.load(file) # Step 6: Vectorize the cleaned text data using TF-IDF vectorizer = TfidfVectorizer(max_features=5000) X_transformed = vectorizer.fit_transform(data["cleaned_text"]).toarray() # Step 7: Use the model to predict 'cleaned_text' data["predicted_target"] = model.predict(X_transformed) # Step 8: Retain only 'Predicted_target' and 'target_encoded' columns for InfluxDB data = data[["predicted_target", "target_encoded"]] # Keep only necessary columns data.rename( columns={"target_encoded": "actual_target"}, inplace=True ) # Rename for consistency # Print 'actual_target' and 'Predicted_target' before sending to InfluxDB print("Actual Target and Predicted Target values:") print(data[["actual_target", "predicted_target"]]) # Display the count of 0 and 1 in 'actual_target' and 'predicted_target' columns print("Count of actual_target values (0 and 1):") print(data["actual_target"].value_counts()) print("Count of predicted_target values (0 and 1):") print(data["predicted_target"].value_counts()) # Display the shape of the DataFrame print("Shape of the DataFrame:", data.shape) print(data.head()) # Step 9: Send the DataFrame to InfluxDB with InfluxDBClient(url=url, 
# Step 9: Send the DataFrame to InfluxDB
with InfluxDBClient(url=url, token=token, org=org) as client:
    write_api = client.write_api(write_options=SYNCHRONOUS)

    # Write DataFrame to InfluxDB
    write_api.write(
        bucket=bucket,
        org=org,
        record=data,
        data_frame_measurement_name="textual_analysis",
        data_frame_field_columns=[
            "predicted_target",
            "actual_target",
        ],  # Fields for InfluxDB
    )

print("Data successfully sent to InfluxDB.")
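# Optional verification sketch (assumptions: the same credentials as above,
# and that querying the bucket's full history is acceptable). Reads the
# 'textual_analysis' measurement back from InfluxDB as a DataFrame to confirm
# the write landed; 'verify_query' is a hypothetical name for illustration.
verify_query = f'''
from(bucket: "{bucket}")
  |> range(start: 0)
  |> filter(fn: (r) => r._measurement == "textual_analysis")
'''
with InfluxDBClient(url=url, token=token, org=org) as client:
    result = client.query_api().query_data_frame(verify_query, org=org)
    # query_data_frame may return a single DataFrame or a list of them
    print("Result tables/rows read back from InfluxDB:", len(result))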