Skip to content

Commit da3148e

Browse files
committed
fix: implement proper Laplace smoothing in Bayes classifier
Replace magic number 0.1 with proper add-one (Laplace) smoothing: P(word|category) = (count + 1) / (total + vocab_size). This ensures smoothing scales correctly with vocabulary size and applies consistently to both seen and unseen words. Fixes #64
1 parent 6e43186 commit da3148e

File tree

3 files changed

+113
-12
lines changed

3 files changed

+113
-12
lines changed

lib/classifier/bayes.rb

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,20 +64,25 @@ def untrain(category, text)
6464
# b.classifications "I hate bad words and you"
6565
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
6666
# The largest of these scores (the one closest to 0) is the one picked out by #classify
67-
def classifications(text)
67+
def classifications(text) # rubocop:disable Metrics/AbcSize
6868
score = {}
6969
word_hash = text.word_hash
70-
training_count = @category_counts.values.inject { |x, y| x + y }.to_f
70+
training_count = @category_counts.values.sum.to_f
71+
vocab_size = [@categories.values.flat_map(&:keys).uniq.size, 1].max
72+
7173
@categories.each do |category, category_words|
7274
score[category.to_s] = 0
73-
total = (@category_word_count[category] || 1).to_f
75+
total = @category_word_count[category] || 0
76+
smoothed_total = (total + vocab_size).to_f
77+
78+
# Laplace smoothing: P(word|category) = (count + α) / (total + α * V)
7479
word_hash.each_key do |word|
75-
s = category_words.key?(word) ? category_words[word] : 0.1
76-
score[category.to_s] += Math.log(s / total)
80+
count = category_words[word] || 0
81+
score[category.to_s] += Math.log((count + 1) / smoothed_total)
7782
end
78-
# now add prior probability for the category
79-
s = @category_counts.key?(category) ? @category_counts[category] : 0.1
80-
score[category.to_s] += Math.log(s / training_count)
83+
84+
prior = @category_counts[category] || 0.1
85+
score[category.to_s] += Math.log(prior / training_count)
8186
end
8287
score
8388
end

test/bayes/bayesian_test.rb

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,4 +337,102 @@ def test_numbers_in_text
337337

338338
assert_equal 'Interesting', result, 'Should handle numbers in text'
339339
end
340+
341+
# Laplace smoothing tests
342+
343+
def test_laplace_smoothing_unseen_words
344+
# Train with some words, then classify with unseen word
345+
# Laplace smoothing should give unseen words a non-zero probability
346+
# that scales with vocabulary size
347+
@classifier.train_interesting 'apple banana cherry'
348+
@classifier.train_uninteresting 'dog elephant fox'
349+
350+
# "zebra" is unseen - should still get valid scores
351+
scores = @classifier.classifications('zebra')
352+
353+
scores.each_value do |score|
354+
assert_predicate score, :finite?, 'Score should be finite with Laplace smoothing'
355+
refute_predicate score, :zero?, 'Score should be non-zero with Laplace smoothing'
356+
end
357+
end
358+
359+
def test_laplace_smoothing_consistency
360+
# With proper Laplace smoothing, the probability of an unseen word
361+
# should be α / (total + α * vocab_size)
362+
# This should be consistent across categories with same training size
363+
classifier = Classifier::Bayes.new 'A', 'B'
364+
classifier.train_a 'word1 word2 word3'
365+
classifier.train_b 'word4 word5 word6'
366+
367+
scores = classifier.classifications('unseenword')
368+
369+
# Both categories have same word count, so unseen word scores should be equal
370+
assert_in_delta scores['A'], scores['B'], 0.01,
371+
'Equal-sized categories should give equal scores for unseen words'
372+
end
373+
374+
def test_laplace_smoothing_vocabulary_scaling
375+
# The smoothing should account for vocabulary size
376+
# Larger vocabulary = smaller probability for each unseen word
377+
small_vocab = Classifier::Bayes.new 'Cat', 'Dog'
378+
small_vocab.train_cat 'meow purr'
379+
small_vocab.train_dog 'bark woof'
380+
381+
large_vocab = Classifier::Bayes.new 'Cat', 'Dog'
382+
large_vocab.train_cat 'meow purr hiss scratch climb jump pounce stalk hunt sleep'
383+
large_vocab.train_dog 'bark woof growl fetch run play chase guard protect howl'
384+
385+
small_scores = small_vocab.classifications('unknown')
386+
large_scores = large_vocab.classifications('unknown')
387+
388+
# With proper smoothing, larger vocabulary should give lower (more negative) scores
389+
# for unseen words because probability mass is spread across more terms
390+
assert_operator small_scores['Cat'], :>, large_scores['Cat'],
391+
'Larger vocabulary should give lower scores for unseen words'
392+
end
393+
394+
def test_laplace_smoothing_seen_words_also_smoothed
395+
# Proper Laplace smoothing applies to ALL words, not just unseen ones
396+
# P(word|cat) = (count + α) / (total + α * V), not count / total
397+
classifier = Classifier::Bayes.new 'A', 'B'
398+
classifier.train_a 'test'
399+
classifier.train_b 'other'
400+
401+
# With proper smoothing, seen word probability should include α adjustment
402+
# The word "test" appears once in A with total=1, vocab=2
403+
# Proper: (1 + 1) / (1 + 1*2) = 2/3
404+
# Current: 1 / 1 = 1.0 (no smoothing applied to seen words)
405+
406+
scores = classifier.classifications('test')
407+
408+
# Score for A should reflect smoothed probability, not raw count
409+
# log(2/3) ≈ -0.405, not log(1) = 0
410+
# The word score plus prior should not equal just the prior
411+
prior_only_score = Math.log(0.5) # equal priors
412+
413+
refute_in_delta scores['A'], prior_only_score, 0.01,
414+
'Seen word score should include smoothing adjustment, not raw probability'
415+
end
416+
417+
def test_laplace_smoothing_denominator_includes_vocabulary
418+
# The denominator should be (total + α * vocab_size), not just total
419+
# This test verifies that adding more vocabulary affects all probabilities
420+
classifier1 = Classifier::Bayes.new 'Spam', 'Ham'
421+
classifier1.train_spam 'buy now'
422+
classifier1.train_ham 'hello friend'
423+
424+
classifier2 = Classifier::Bayes.new 'Spam', 'Ham'
425+
classifier2.train_spam 'buy now'
426+
classifier2.train_ham 'hello friend goodbye see you later take care'
427+
428+
# Same query word "buy" - should have different probabilities
429+
# because vocabulary size differs (affecting denominator)
430+
scores1 = classifier1.classifications('buy')
431+
scores2 = classifier2.classifications('buy')
432+
433+
# With proper smoothing, larger vocab in classifier2 means
434+
# the probability of "buy" in Spam is lower (spread across more terms)
435+
refute_in_delta scores1['Spam'], scores2['Spam'], 0.1,
436+
'Vocabulary size should affect word probabilities in denominator'
437+
end
340438
end

test/lsi/lsi_test.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,11 @@ def test_external_classifying
6060
lsi.add_item @str5, 'Bird'
6161
bayes.train_bird @str5
6262

63-
# We're talking about dogs. Even though the text matches the corpus on
64-
# cats better. Dogs have more semantic weight than cats. So bayes
65-
# will fail here, but the LSI recognizes content.
63+
# Both classifiers should recognize this is about dogs
6664
tricky_case = 'This text revolves around dogs.'
6765

6866
assert_equal 'Dog', lsi.classify(tricky_case)
69-
assert_equal 'Cat', bayes.classify(tricky_case)
67+
assert_equal 'Dog', bayes.classify(tricky_case)
7068
end
7169

7270
def test_recategorize_interface

0 commit comments

Comments
 (0)