Devacron.com

Building a Naive Bayes Spam Filter with Testing and Updating Capabilities

Naive Bayes Spam Filter

In this tutorial, we’ll implement a Naive Bayes spam filter that not only classifies emails but also includes testing functionality and the ability to update its knowledge base with new data.

Overview

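Our implementation will include Laplace smoothing, an adjustable spam threshold, an accuracy test harness, and the ability to update the model with newly labeled messages.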

Implementation

Here’s the complete implementation of our advanced Naive Bayes classifier:

class NaiveBayesClassifier:
    """Word-count Naive Bayes classifier that labels messages "spam" or "ham".

    The model keeps one word-frequency table and one message counter per
    class. It can be trained in bulk with ``train`` and extended one message
    at a time with ``update``; both accumulate into the same state.
    """

    def __init__(self):
        # word -> number of occurrences across all spam / ham messages seen
        self.spam_words = {}
        self.ham_words = {}
        # number of spam / ham messages seen
        self.spam_count = 0
        self.ham_count = 0

    def tokenize(self, text):
        """Converts text to lowercase and splits it on whitespace."""
        return text.lower().split()

    def _add_message(self, message, word_counts):
        """Adds every word of ``message`` to ``word_counts`` in place."""
        for word in self.tokenize(message):
            word_counts[word] = word_counts.get(word, 0) + 1

    def train(self, spam_messages, ham_messages):
        """Trains the classifier on a batch of labeled messages.

        Counts are accumulated rather than overwritten, so calling ``train``
        again (or after ``update``) keeps message counts and word counts in
        step with each other.

        Args:
            spam_messages: sized collection of spam message strings.
            ham_messages: sized collection of ham message strings.
        """
        self.spam_count += len(spam_messages)
        self.ham_count += len(ham_messages)

        for message in spam_messages:
            self._add_message(message, self.spam_words)

        for message in ham_messages:
            self._add_message(message, self.ham_words)

    def calculate_probability(self, word, word_counts, message_count, vocabulary_size):
        """Calculates a smoothed word probability for one class.

        Add-one smoothing keeps unseen words from producing a zero factor.
        Note that the denominator uses the class's message count plus the
        vocabulary size, as in the original tutorial.
        """
        return (word_counts.get(word, 0) + 1) / (message_count + vocabulary_size)

    def classify(self, message, threshold=0.5):
        """Classifies a message as spam or ham.

        Scores are combined in log space so that long messages do not
        underflow to zero, then mapped back to a spamicity in [0, 1].

        Args:
            message: the text to classify.
            threshold: spamicity strictly above this value means "spam".

        Returns:
            A ``(label, spamicity)`` tuple. When no spam has been seen
            (including a completely untrained model) spamicity is 0.0; when
            spam has been seen but no ham, it is 1.0.
        """
        import math  # local import, matching the other snippets in this file

        if self.spam_count == 0:
            # Zero spam prior (or no data at all): nothing can look like spam.
            spamicity = 0.0
        elif self.ham_count == 0:
            # Zero ham prior: all probability mass is on spam.
            spamicity = 1.0
        else:
            vocabulary_size = len(self.spam_words.keys() | self.ham_words.keys())

            # log(P(spam) / P(ham)); the shared total-message divisor cancels.
            log_odds = math.log(self.spam_count) - math.log(self.ham_count)

            for word in self.tokenize(message):
                log_odds += math.log(self.calculate_probability(
                    word, self.spam_words, self.spam_count, vocabulary_size))
                log_odds -= math.log(self.calculate_probability(
                    word, self.ham_words, self.ham_count, vocabulary_size))

            # Logistic of the log-odds, arranged so exp() only ever sees a
            # non-positive argument and therefore cannot overflow.
            if log_odds >= 0:
                spamicity = 1.0 / (1.0 + math.exp(-log_odds))
            else:
                odds = math.exp(log_odds)
                spamicity = odds / (1.0 + odds)

        return "spam" if spamicity > threshold else "ham", spamicity

    def update(self, message, label):
        """Updates the classifier with one new labeled message.

        Args:
            message: the message text.
            label: "spam" or "ham"; any other label leaves the model unchanged.
        """
        if label == "spam":
            self.spam_count += 1
            self._add_message(message, self.spam_words)
        elif label == "ham":
            self.ham_count += 1
            self._add_message(message, self.ham_words)

Using the Classifier

Here’s how to use the classifier with a real example:

# Labeled messages the classifier learns from
previous_spam = [
    "send us your password",
    "review our website",
    "send your password",
    "send us your account",
]
previous_ham = [
    "Your activity report",
    "benefits physical activity",
    "the importance vows",
]

# Labeled messages kept aside for evaluation, keyed by their true label
new_emails = {
    "spam": [
        "renew your password",
        "renew your vows",
    ],
    "ham": [
        "benefits of our account",
        "the importance of physical activity",
    ],
}

# Spamicity strictly above this value is reported as spam
SPAM_THRESHOLD = 0.6

# Build the classifier and fit it on the initial corpus
classifier = NaiveBayesClassifier()
classifier.train(spam_messages=previous_spam, ham_messages=previous_ham)

# Test the classifier
def test_classifier(classifier, test_data, threshold):
    correct = 0
    total = 0

    for true_label, messages in test_data.items():
        for message in messages:
            prediction, spamicity = classifier.classify(message, threshold)
            total += 1
            if prediction == true_label:
                correct += 1

            print(f"Message: '{message}'")
            print(f"Spamicity: {spamicity:.4f}")
            print(f"Prediction: {prediction}")
            print(f"True label: {true_label}\n")

    accuracy = correct / total if total > 0 else 0
    print(f"Accuracy: {accuracy:.2%}")
    return accuracy

Handling Model Updates

One of the key features of our implementation is the ability to update the model with new data:

def update_classifier(classifier, new_data):
    """Feed every labeled message in ``new_data`` to ``classifier.update``.

    ``new_data`` maps a label to a list of messages; messages are applied in
    mapping order and each one is echoed to stdout.
    """
    labeled = (
        (label, text)
        for label, batch in new_data.items()
        for text in batch
    )
    for label, text in labeled:
        classifier.update(text, label)
        print(f"Updated with message: '{text}' as {label}")

# Baseline: score the emails in new_emails, which were not part of the
# initial train() call, before the model has been updated with them.
print("Initial accuracy:")
initial_accuracy = test_classifier(classifier, new_emails, SPAM_THRESHOLD)

# Feed those same labeled emails into the model via update().
print("\nUpdating classifier...")
update_classifier(classifier, new_emails)

# Re-score after the update.
# NOTE(review): this evaluates on the exact messages the model was just
# updated with, so the figure shows fit to already-seen data rather than
# generalization to new mail.
print("\nAccuracy after updates:")
final_accuracy = test_classifier(classifier, new_emails, SPAM_THRESHOLD)

Key Features and Improvements

  1. Laplace Smoothing
  2. Adjustable Threshold
  3. Comprehensive Testing
  4. Dynamic Updates

Performance Considerations

  1. Memory Efficiency
  2. Computational Efficiency
  3. Scalability

Potential Improvements

  1. Better Tokenization
def improved_tokenize(self, text):
    """Tokenize ``text`` after lowercasing and stripping punctuation.

    Characters that are neither word characters nor whitespace are removed,
    the result is split on whitespace, and only words of three or more
    characters are kept.
    """
    import re

    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return [token for token in cleaned.split() if len(token) >= 3]
  2. Feature Weighting
def calculate_word_weight(self, word):
    """Return how unevenly ``word`` is spread between spam and ham.

    The weight is the absolute difference between the word's per-message
    frequency in each class; a class with no messages is treated as having
    one so the division is always defined.
    """
    spam_messages = max(self.spam_count, 1)
    ham_messages = max(self.ham_count, 1)
    spam_rate = self.spam_words.get(word, 0) / spam_messages
    ham_rate = self.ham_words.get(word, 0) / ham_messages
    return abs(spam_rate - ham_rate)
  3. Advanced Probability Calculations
def calculate_log_probability(self, word, is_spam):
    """Return the log of the smoothed probability of ``word`` in one class.

    Working with logs lets per-word scores be summed instead of multiplied,
    which prevents floating-point underflow on long messages.

    Args:
        word: the token to score.
        is_spam: True to score against the spam table, False for ham.

    Returns:
        ``log(p)`` for the smoothed probability ``p``, or ``-inf`` if ``p``
        is not positive.
    """
    import math

    # The classifier does not store a vocabulary_size attribute, so derive it
    # from the two word tables unless the instance happens to provide one.
    vocabulary_size = getattr(self, "vocabulary_size", None)
    if vocabulary_size is None:
        vocabulary_size = len(self.spam_words.keys() | self.ham_words.keys())

    if is_spam:
        word_counts, message_count = self.spam_words, self.spam_count
    else:
        word_counts, message_count = self.ham_words, self.ham_count

    prob = self.calculate_probability(word, word_counts, message_count,
                                      vocabulary_size)
    return math.log(prob) if prob > 0 else float('-inf')

Conclusion

This implementation provides a robust foundation for spam detection that can be extended and improved based on specific needs. The ability to update the model with new data makes it particularly valuable for real-world applications where spam patterns evolve over time.

Key takeaways:

Remember that while this implementation is good for learning and small-scale applications, production systems might need additional features like:

Exit mobile version