Skip to content

Commit

Permalink
Added new functions for detection and removal of personal information.
Browse files Browse the repository at this point in the history
  • Loading branch information
Infinitode committed Apr 13, 2024
1 parent 899829a commit 4509558
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 48 deletions.
24 changes: 22 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
![License Compliance](https://img.shields.io/badge/license-compliance-brightgreen.svg)
![PyPI Version](https://img.shields.io/pypi/v/valx)

An open-source Python library for data cleaning tasks. Includes profanity detection, and removal.
An open-source Python library for data cleaning tasks. Includes profanity detection and removal, as well as personal information detection and removal.

## Installation

Expand All @@ -32,6 +32,8 @@ Please ensure that you have one of these Python versions installed before using

- **Profanity Detection**: Detect profane and NSFW words or terms.
- **Remove Profanity**: Remove profane and NSFW words or terms.
- **Detect Sensitive Information**: Detect sensitive information in text data.
- **Remove Sensitive Information**: Remove sensitive information from text data.

## Usage

Expand All @@ -53,13 +55,31 @@ from valx import remove_profanity
removed = remove_profanity(sample_text, "text_cleaned.txt", language="English")
```

### Detect Sensitive Information

```python
from valx import detect_sensitive_information

# Detect sensitive information
detected_sensitive_info = detect_sensitive_information(sample_text)
```

### Remove Sensitive Information

```python
from valx import remove_sensitive_information

# Remove sensitive information
cleaned_text = remove_sensitive_information(sample_text2)
```

## Contributing

Contributions are welcome! If you encounter any issues, have suggestions, or want to contribute to ValX, please open an issue or submit a pull request on [GitHub](https://github.com/infinitode/valx).

## License

PyWebScrapr is released under the terms of the **MIT License (Modified)**. Please see the [LICENSE](https://github.com/infinitode/pywebscrapr/blob/main/LICENSE) file for the full text.
ValX is released under the terms of the **MIT License (Modified)**. Please see the [LICENSE](https://github.com/infinitode/valx/blob/main/LICENSE) file for the full text.

### Derived licenses
---
Expand Down
24 changes: 23 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from valx import load_profanity_words, detect_profanity, remove_profanity
from valx import load_profanity_words, detect_profanity, remove_profanity, detect_sensitive_information, remove_sensitive_information

def main():
sample_text = [
Expand All @@ -14,5 +14,27 @@ def main():

removed = remove_profanity(sample_text, "text_cleaned.txt", language="All")


# Example usage:
sample_text2 = [
"Please contact john.doe@example.com or call 555-123-4567 for more information.",
"We will need your credit card number to complete the transaction: 1234-5678-9012-3456.",
"My social security number is 123-45-6789 and my ID number is AB123456.",
"Our office address is 123 Main St, Anytown, USA. Please visit us!",
"Your IP address is 192.168.1.1. Please don't share it with anyone."
]

# Detect sensitive information
detected_sensitive_info = detect_sensitive_information(sample_text2)
print("Detected sensitive information:")
for line_num, col_num, info_type, info in detected_sensitive_info:
print(f"Line {line_num}, Column {col_num}: {info_type} - {info}")

# Remove sensitive information
cleaned_text = remove_sensitive_information(sample_text2)
print("\nCleaned text:")
for line in cleaned_text:
print(line)

if __name__ == "__main__":
main()
138 changes: 93 additions & 45 deletions valx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,50 +10,6 @@
https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/LICENSE
'''


# def load_profanity_words(language='English'):
# """
# Load profanity words from the local text file.

# Args:
# language (str): The language for which to load profanity words. Defaults to 'English'.

# Returns:
# list: A list of profanity words for the specified language, or all languages if 'All' is specified.
# """
# file_path = os.path.join(os.path.dirname(__file__), 'profanity_words.txt')
# try:
# with open(file_path, 'r', encoding='utf-8') as file:
# profanity_lists = {}
# current_language = None
# current_words = []
# for line in file:
# line = line.strip()
# if line.startswith("$") and line.endswith("$"):
# if current_language:
# profanity_lists[current_language] = current_words
# current_words = []
# current_language = line.strip("$")
# else:
# current_words.append(line)
# if current_language:
# profanity_lists[current_language] = current_words

# if language == 'All':
# all_profanity_words = []
# for words in profanity_lists.values():
# all_profanity_words.extend(words)
# return all_profanity_words
# else:
# return profanity_lists.get(language, [])
# except FileNotFoundError:
# print("Failed to load profanity words file. Using default list.")
# return [
# 'profane_word1',
# 'profane_word2',
# 'profane_word3'
# ]

def load_profanity_words(language='English'):
"""
Load profanity words for the specified language.
Expand Down Expand Up @@ -2787,7 +2743,7 @@ def load_profanity_words(language='English'):
return profanity_lists.get(language, [])


def detect_profanity(text_data, language='English', allowed_languages=None):
def detect_profanity(text_data, language='English'):
"""
Detect profanity in text data using regex.
Expand Down Expand Up @@ -2853,3 +2809,95 @@ def remove_profanity(text_data, output_file=None, language='English'):
file.write('\n'.join(cleaned_data))

return cleaned_data

# New functions for 1.5
def detect_sensitive_information(text_data):
    """
    Detect sensitive information patterns in the provided text data.

    Scans each line for email addresses, phone numbers, credit card
    numbers, social security numbers, ID numbers, street addresses and
    IP addresses using regular expressions.

    Args:
        text_data (list of str): Lines of text to analyze.

    Returns:
        list of tuple: One tuple per detection, as
        (line_number, column_index, info_type, matched_text).
        Line numbers are 1-based; column indices are 0-based.
    """
    # (label, compiled pattern) pairs, applied in this order to each line.
    # Compiling once here avoids re-compiling inside the per-line loop.
    patterns = [
        # Email addresses
        ('Email', re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')),
        # Phone numbers (555-123-4567, (555) 123 4567, 123-4567, ...)
        ('Phone', re.compile(r'\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b')),
        # Credit card numbers (Visa, Mastercard, American Express, Discover)
        ('Credit Card', re.compile(r'\b(?:\d[ -]*?){13,16}\b')),
        # Social security numbers (SSN)
        ('SSN', re.compile(r'\b\d{3}[-]?\d{2}[-]?\d{4}\b')),
        # ID numbers (e.g., driver's license, passport)
        ('ID Number', re.compile(r'\b[A-Za-z]{1,2}\d{6,9}\b')),
        # Street addresses (simple heuristic)
        ('Address', re.compile(r'\b\d+\s\w+\s\w+,\s\w+,\s\w+\b')),
        # IPv4 addresses
        ('IP Address', re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')),
    ]

    sensitive_info = []
    for line_index, line in enumerate(text_data):
        for info_type, pattern in patterns:
            # finditer() yields match objects carrying the true start
            # column of every occurrence. The previous implementation used
            # line.find(match_text), which reports the position of the
            # FIRST occurrence for every repeated value on a line.
            for match in pattern.finditer(line):
                sensitive_info.append(
                    (line_index + 1, match.start(), info_type, match.group()))

    return sensitive_info

def remove_sensitive_information(text_data, output_file=None):
    """
    Remove sensitive information patterns from the provided text data.

    Every value found by detect_sensitive_information() is replaced with
    the placeholder '[SENSITIVE]' wherever it occurs in any line.

    Args:
        text_data (list of str): Lines of text to clean.
        output_file (str, optional): Path of a file to write the cleaned
            lines to (newline-joined). Defaults to None (no file output).

    Returns:
        list of str: The cleaned lines, in the original order.
    """
    # Deduplicate detected values, then replace longer values first:
    # replacing a shorter detection that is embedded in a longer one
    # (e.g. an SSN-shaped run inside a card number) would otherwise split
    # the longer value and leave fragments behind. The secondary sort key
    # keeps the order deterministic for equal-length values.
    detected_values = {info for _, _, _, info in
                       detect_sensitive_information(text_data)}
    ordered_values = sorted(detected_values, key=lambda v: (-len(v), v))

    cleaned_data = []
    for line in text_data:
        for value in ordered_values:
            line = line.replace(value, '[SENSITIVE]')
        cleaned_data.append(line)

    # Optionally persist the cleaned text to disk.
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write('\n'.join(cleaned_data))

    return cleaned_data

0 comments on commit 4509558

Please sign in to comment.