Skip to content

Commit

Permalink
Merge pull request #7 from BU-Spark/model_experimentation
Browse files Browse the repository at this point in the history
Racist Deeds PoC
  • Loading branch information
NathanielQuisel authored Nov 7, 2024
2 parents e217de7 + e785aae commit 76f5962
Show file tree
Hide file tree
Showing 819 changed files with 106,968 additions and 6 deletions.
14 changes: 8 additions & 6 deletions modules/deed_preprocessing/eda.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -76,7 +76,9 @@
"OCR result for 000594-0301.TIF saved to 000594-0301.txt\n",
"OCR result for 000594-0401.TIF saved to 000594-0401.txt\n",
"OCR result for 000594-0501.TIF saved to 000594-0501.txt\n",
"OCR processing complete. Text files are saved in: ./outputs\n"
"OCR result for 000547-0001.TIF saved to 000547-0001.txt\n",
"OCR result for 000557-0001.TIF saved to 000557-0001.txt\n",
"OCR processing complete. Text files are saved in: ./NonRacist_deeds\n"
]
}
],
Expand Down Expand Up @@ -132,7 +134,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -220,7 +222,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -893,7 +895,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -907,7 +909,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.1"
"version": "3.12.6"
}
},
"nbformat": 4,
Expand Down
73 changes: 73 additions & 0 deletions modules/model_experimentation/bag_of_words_logistic_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def preprocess_bag_of_words(preprocessed_text_list):
    """Turn deed records into a dense bag-of-words count table.

    Args:
        preprocessed_text_list: Iterable of mappings; each must provide the
            document text under the ``"original_text"`` key.

    Returns:
        A ``(bow_df, vectorizer)`` tuple. ``bow_df`` has one row per input
        record and one column per vocabulary term; ``vectorizer`` is the
        fitted ``CountVectorizer`` that produced it.
    """
    documents = []
    for record in preprocessed_text_list:
        documents.append(record["original_text"])

    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(documents)

    # The sparse matrix is expanded to a dense array here, so memory use
    # grows with (number of documents) x (vocabulary size).
    vocabulary = vectorizer.get_feature_names_out()
    dense_counts = term_counts.toarray()
    bow_df = pd.DataFrame(dense_counts, columns=vocabulary)

    return bow_df, vectorizer

if __name__ == "__main__":
    # Load the labelled deed table (create_df.py writes a file with this name).
    # NOTE: read_pickle executes whatever the pickle contains; only load
    # files you generated yourself.
    deeds = pd.read_pickle('preprocessed_deeds.pkl')

    # preprocess_bag_of_words expects records keyed by "original_text".
    records = [{"original_text": text} for text in deeds['original_text']]
    features, fitted_vectorizer = preprocess_bag_of_words(records)

    # Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
    labels = deeds['is_racist']
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # --- Text metrics -----------------------------------------------------
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    # --- Confusion matrix heatmap -----------------------------------------
    class_names = ['Non-racist', 'Racist']
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # --- ROC curve ----------------------------------------------------------
    # Column 1 of predict_proba is the probability of the second class.
    positive_scores = classifier.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # --- Words with the largest positive coefficients -----------------------
    # Assumes labels are 0/1 with 1 = racist, so large positive weights push
    # a document towards the "racist" prediction.
    word_weights = pd.Series(
        classifier.coef_[0],
        index=fitted_vectorizer.get_feature_names_out(),
    )
    strongest_words = word_weights.nlargest(10)

    plt.figure(figsize=(8, 6))
    strongest_words.plot(kind='barh', color='skyblue')
    plt.title('Top 10 Most Influential Words for Racist Classification')
    plt.xlabel('Coefficient Value')
    plt.ylabel('Word')
    plt.show()
50 changes: 50 additions & 0 deletions modules/model_experimentation/create_df.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import pandas as pd
from pathlib import Path
import sys
sys.path.append('../deed_preprocessing')
from preprocessor import preprocess_text
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def preprocess_deeds():
    """Read every deed text file, preprocess it and return one labelled table.

    ``.txt`` files are read from ``./racist_deeds_text`` (label 1) and
    ``./non_racist_deeds_text`` (label 0); both paths are resolved against
    the current working directory. Each file's text is passed through
    ``preprocess_text`` and becomes one row, with the label stored in an
    ``is_racist`` column. The number of files read per class is printed.

    Returns:
        pandas.DataFrame: One row per text file, racist deeds first, with a
        fresh 0..n-1 index. An empty DataFrame if no ``.txt`` files exist.

    Raises:
        FileNotFoundError: If either input directory does not exist.
    """
    racist_dir = Path('./racist_deeds_text')
    non_racist_dir = Path('./non_racist_deeds_text')

    def process_directory(directory, is_racist_label):
        """Return one single-row DataFrame per ``.txt`` file in *directory*."""
        frames = []
        # iterdir() yields entries in arbitrary order; sorting keeps the row
        # order (and any seeded train/test split built on it) reproducible.
        for file in sorted(directory.iterdir()):
            if file.is_file() and file.suffix == '.txt':
                text = file.read_text(encoding='utf-8')
                df = pd.DataFrame([preprocess_text(text)])
                df['is_racist'] = is_racist_label
                frames.append(df)
        return frames

    racist_frames = process_directory(racist_dir, 1)
    non_racist_frames = process_directory(non_racist_dir, 0)

    print(f"Number of racist text files read: {len(racist_frames)}")
    print(f"Number of non-racist text files read: {len(non_racist_frames)}")

    frames = racist_frames + non_racist_frames
    if not frames:
        # pd.concat refuses an empty list; keep returning an empty table.
        return pd.DataFrame()

    # Concatenate once: growing a DataFrame inside the loop copies all
    # accumulated rows on every iteration.
    return pd.concat(frames, ignore_index=True)

if __name__ == "__main__":
    # Build the labelled deed table and persist it in the working directory
    # for the modelling scripts to load.
    preprocess_deeds().to_pickle('preprocessed_deeds.pkl')
43 changes: 43 additions & 0 deletions modules/model_experimentation/extract_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
import pandas as pd
import sys
sys.path.append('../deed_preprocessing')
from preprocessor import preprocess_text

# All paths are anchored to the folder containing this script, so the
# results do not depend on the current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Output location: <script folder>/extracted_info/extracted_names_locations.xlsx
extracted_info_dir = os.path.join(script_dir, "extracted_info")
output_file = os.path.join(extracted_info_dir, "extracted_names_locations.xlsx")

# Input location: the OCR'd deed text files.
racist_deeds_dir = os.path.join(script_dir, "racist_deeds_text")

# Create the output folder up front; an already existing folder is fine.
os.makedirs(extracted_info_dir, exist_ok=True)

def extract_names_and_locations():
    """Write the names and locations found in each racist deed to Excel.

    Every ``.txt`` file in ``racist_deeds_dir`` is read as UTF-8 and passed
    through ``preprocess_text``. The ``"names"`` and ``"locations"`` entries
    of the result are each joined with ``", "`` and stored as one
    spreadsheet row per file in ``output_file``.

    Returns:
        None. The result is the Excel file written to ``output_file``.

    Raises:
        FileNotFoundError: If ``racist_deeds_dir`` does not exist.
    """
    columns = ["File Name", "Names", "Locations"]
    rows = []

    # os.listdir() returns entries in arbitrary order; sort so the
    # spreadsheet rows come out in the same order on every run.
    for file_name in sorted(os.listdir(racist_deeds_dir)):
        if not file_name.endswith(".txt"):
            continue

        file_path = os.path.join(racist_deeds_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            processed = preprocess_text(f.read())

        # A key that is missing or present-but-None means "nothing found";
        # without the `or []` a None value would crash the join below.
        names = processed.get("names") or []
        locations = processed.get("locations") or []

        rows.append({
            "File Name": file_name,
            "Names": ", ".join(names),
            "Locations": ", ".join(locations),
        })

    # Passing the column list keeps the header row even when no files matched.
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(output_file, index=False)

# Run the extraction and save the output to an Excel file in the
# extracted_info directory.
# NOTE(review): this call is not wrapped in an `if __name__ == "__main__":`
# guard, so it also runs whenever this module is imported. Confirm that is
# intended before reusing the function from other code.
extract_names_and_locations()
Loading

0 comments on commit 76f5962

Please sign in to comment.