Merge pull request #7 from BU-Spark/model_experimentation

Racist Deeds PoC
BU-Spark · Nov 7, 2024 · 76f5962 · 76f5962
2 parents e217de7 + e785aae
commit 76f5962
Show file tree

Hide file tree

Showing 819 changed files with 106,968 additions and 6 deletions.
diff --git a/modules/deed_preprocessing/eda.ipynb b/modules/deed_preprocessing/eda.ipynb
@@ -63,7 +63,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -76,7 +76,9 @@
       "OCR result for 000594-0301.TIF saved to 000594-0301.txt\n",
       "OCR result for 000594-0401.TIF saved to 000594-0401.txt\n",
       "OCR result for 000594-0501.TIF saved to 000594-0501.txt\n",
-      "OCR processing complete. Text files are saved in: ./outputs\n"
+      "OCR result for 000547-0001.TIF saved to 000547-0001.txt\n",
+      "OCR result for 000557-0001.TIF saved to 000557-0001.txt\n",
+      "OCR processing complete. Text files are saved in: ./NonRacist_deeds\n"
      ]
     }
    ],
@@ -132,7 +134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -220,7 +222,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -893,7 +895,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -907,7 +909,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.1"
+   "version": "3.12.6"
   }
  },
  "nbformat": 4,

diff --git a/modules/model_experimentation/bag_of_words_logistic_regression.py b/modules/model_experimentation/bag_of_words_logistic_regression.py
@@ -0,0 +1,73 @@
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
+import matplotlib.pyplot as plt
+import seaborn as sns
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+def preprocess_bag_of_words(preprocessed_text_list):
+    """Turn preprocessed deed records into a bag-of-words count table.
+
+    Args:
+        preprocessed_text_list: iterable of mappings; only the
+            "original_text" entry of each one is read.
+
+    Returns:
+        A ``(bow_df, vectorizer)`` tuple: a dense DataFrame of token counts
+        with one row per record and one column per vocabulary term, and the
+        fitted CountVectorizer that produced it.
+    """
+    documents = [record["original_text"] for record in preprocessed_text_list]
+
+    vectorizer = CountVectorizer()
+    term_counts = vectorizer.fit_transform(documents)
+
+    # Column labels are the learned vocabulary, in the vectorizer's own order.
+    vocabulary = vectorizer.get_feature_names_out()
+    dense_counts = term_counts.toarray()
+    bow_df = pd.DataFrame(dense_counts, columns=vocabulary)
+
+    return bow_df, vectorizer
+
+if __name__ == "__main__":
+    # Load the pickled frame; only its 'original_text' and 'is_racist' columns are used here.
+    preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')
+
+    # preprocess_bag_of_words() reads records keyed by "original_text".
+    records = [{"original_text": text} for text in preprocessed_data['original_text']]
+    features, vectorizer = preprocess_bag_of_words(records)
+    labels = preprocessed_data['is_racist']
+
+    # Hold out 30% of the rows; the fixed seed keeps the split repeatable.
+    X_train, X_test, y_train, y_test = train_test_split(
+        features, labels, test_size=0.3, random_state=42
+    )
+
+    classifier = LogisticRegression(max_iter=1000)
+    classifier.fit(X_train, y_train)
+    predictions = classifier.predict(X_test)
+
+    # --- Text metrics ---
+    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
+    print("\nClassification Report:")
+    print(classification_report(y_test, predictions))
+
+    # --- Confusion matrix heatmap ---
+    class_names = ['Non-racist', 'Racist']
+    matrix = confusion_matrix(y_test, predictions)
+    plt.figure(figsize=(6, 4))
+    sns.heatmap(
+        matrix,
+        annot=True,
+        fmt="d",
+        cmap="Blues",
+        xticklabels=class_names,
+        yticklabels=class_names,
+    )
+    plt.title('Confusion Matrix')
+    plt.xlabel('Predicted')
+    plt.ylabel('Actual')
+    plt.show()
+
+    # --- ROC curve ---
+    # Column 1 of predict_proba is the second class; presumably label 1 (racist) - verify against the data.
+    positive_scores = classifier.predict_proba(X_test)[:, 1]
+    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
+    area_under_curve = auc(false_pos_rate, true_pos_rate)
+
+    plt.figure(figsize=(6, 4))
+    plt.plot(false_pos_rate, true_pos_rate, label=f'ROC curve (AUC = {area_under_curve:.2f})')
+    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual baseline
+    plt.xlim([0.0, 1.0])
+    plt.ylim([0.0, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('Receiver Operating Characteristic (ROC) Curve')
+    plt.legend(loc="lower right")
+    plt.show()
+
+    # --- Ten largest coefficients ---
+    coefficients = pd.Series(classifier.coef_[0], index=vectorizer.get_feature_names_out())
+    strongest = coefficients.nlargest(10)
+
+    plt.figure(figsize=(8, 6))
+    strongest.plot(kind='barh', color='skyblue')
+    plt.title('Top 10 Most Influential Words for Racist Classification')
+    plt.xlabel('Coefficient Value')
+    plt.ylabel('Word')
+    plt.show()
diff --git a/modules/model_experimentation/create_df.py b/modules/model_experimentation/create_df.py
@@ -0,0 +1,50 @@
+import os
+import pandas as pd
+from pathlib import Path
+import sys
+sys.path.append('../deed_preprocessing')
+from preprocessor import preprocess_text
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+def preprocess_deeds():
+    """Load the labelled deed texts and return them as one preprocessed DataFrame.
+
+    Every ``.txt`` file directly inside ``./racist_deeds_text`` (label 1) and
+    ``./non_racist_deeds_text`` (label 0) is read as UTF-8, passed through the
+    module-level ``preprocess_text`` and turned into one row. Both folders are
+    resolved against the current working directory. The number of files read
+    from each folder is printed.
+
+    Returns:
+        pandas.DataFrame: one row per file (racist rows first), holding the
+        fields produced by ``preprocess_text`` plus an ``is_racist`` column.
+        An empty DataFrame when neither folder contains a text file.
+
+    Raises:
+        FileNotFoundError: if either input folder does not exist.
+    """
+    racist_dir = Path('./racist_deeds_text')
+    non_racist_dir = Path('./non_racist_deeds_text')
+
+    def process_directory(directory, is_racist_label):
+        """Return one single-row DataFrame per ``.txt`` file in *directory*."""
+        frames = []
+        # Sorted so the row order does not depend on the filesystem's listing order.
+        for file in sorted(directory.iterdir()):
+            if not (file.is_file() and file.suffix == '.txt'):
+                continue
+            text = file.read_text(encoding='utf-8')
+            df = pd.DataFrame([preprocess_text(text)])
+            df['is_racist'] = is_racist_label
+            frames.append(df)
+        return frames
+
+    racist_frames = process_directory(racist_dir, 1)
+    non_racist_frames = process_directory(non_racist_dir, 0)
+
+    print(f"Number of racist text files read: {len(racist_frames)}")
+    print(f"Number of non-racist text files read: {len(non_racist_frames)}")
+
+    frames = racist_frames + non_racist_frames
+    if not frames:
+        # pd.concat() rejects an empty list, so keep the empty-frame result explicit.
+        return pd.DataFrame()
+
+    # One concat at the end instead of growing a frame inside the loop: linear
+    # rather than quadratic, and no concatenation onto an empty DataFrame.
+    return pd.concat(frames, ignore_index=True)
+
+if __name__ == "__main__":
+    # Build the labelled dataset and pickle it into the current working directory.
+    preprocess_deeds().to_pickle('preprocessed_deeds.pkl')
diff --git a/modules/model_experimentation/extract_names.py b/modules/model_experimentation/extract_names.py
@@ -0,0 +1,43 @@
+import os
+import pandas as pd
+import sys
+sys.path.append('../deed_preprocessing')
+from preprocessor import preprocess_text
+
+# All paths are anchored at the folder that contains this script
+script_dir = os.path.split(os.path.abspath(__file__))[0]
+racist_deeds_dir, extracted_info_dir = (
+    os.path.join(script_dir, folder_name)
+    for folder_name in ("racist_deeds_text", "extracted_info")
+)
+output_file = os.path.join(extracted_info_dir, "extracted_names_locations.xlsx")
+
+# Create the output folder up front; an already existing folder is not an error
+os.makedirs(extracted_info_dir, exist_ok=True)
+
+def extract_names_and_locations():
+    """Write the names and locations found in each racist deed to an Excel file.
+
+    Every regular ``.txt`` file in ``racist_deeds_dir`` is read as UTF-8 and
+    passed through ``preprocess_text``. The "names" and "locations" entries of
+    the result (each defaulting to an empty list) are joined with ", " into one
+    row per file. The rows are saved to ``output_file`` under the columns
+    "File Name", "Names" and "Locations".
+
+    Returns:
+        None. The only output is the spreadsheet written to ``output_file``.
+    """
+    data = []
+
+    # os.listdir() returns entries in arbitrary order; sort so the rows come
+    # out in the same order on every run.
+    for file in sorted(os.listdir(racist_deeds_dir)):
+        path = os.path.join(racist_deeds_dir, file)
+        # Skip non-text entries, and directories whose name merely ends in ".txt".
+        if not (file.endswith(".txt") and os.path.isfile(path)):
+            continue
+
+        # Read first so the file is closed before preprocessing runs.
+        with open(path, 'r', encoding='utf-8') as f:
+            text = f.read()
+        processed = preprocess_text(text)
+
+        data.append({
+            "File Name": file,
+            "Names": ", ".join(processed.get("names", [])),
+            "Locations": ", ".join(processed.get("locations", []))
+        })
+
+    # Name the columns explicitly so an empty input folder still yields a header row.
+    df = pd.DataFrame(data, columns=["File Name", "Names", "Locations"])
+    df.to_excel(output_file, index=False)
+
+# Runs at module level with no __main__ guard: executing or importing this file writes the Excel output
+extract_names_and_locations()