@@ -337,4 +337,102 @@ def test_numbers_in_text
337337
338338 assert_equal 'Interesting' , result , 'Should handle numbers in text'
339339 end
340+
341+ # Laplace smoothing tests
342+
# Classifying text made up solely of a never-trained word must still
# produce a usable score for every category.
#
# With Laplace smoothing an unseen word is expected to receive a small,
# non-zero probability, so each returned score should be finite and
# non-zero. Relies on @classifier, which is prepared outside this method.
def test_laplace_smoothing_unseen_words
  @classifier.train_interesting 'apple banana cherry'
  @classifier.train_uninteresting 'dog elephant fox'

  # "zebra" appears in neither training text.
  scores = @classifier.classifications('zebra')

  # Guard against a vacuous pass: with no scores at all, the loop below
  # would run zero times and assert nothing.
  refute_empty scores, 'Classifications should not be empty'

  scores.each_value do |score|
    assert_predicate score, :finite?, 'Score should be finite with Laplace smoothing'
    refute_predicate score, :zero?, 'Score should be non-zero with Laplace smoothing'
  end
end
358+
# Two categories trained on the same number of words should score an
# unseen word alike.
#
# The expectation behind this test is that a smoothed unseen-word
# probability takes the form alpha / (total + alpha * vocab_size), which
# is identical for categories whose totals match.
def test_laplace_smoothing_consistency
  bayes = Classifier::Bayes.new('A', 'B')
  bayes.train_a('word1 word2 word3')
  bayes.train_b('word4 word5 word6')

  unseen_scores = bayes.classifications('unseenword')

  # Three training words on each side, so nothing should separate A from B.
  assert_in_delta(
    unseen_scores['A'],
    unseen_scores['B'],
    0.01,
    'Equal-sized categories should give equal scores for unseen words'
  )
end
373+
# An unseen word should score lower once the classifier has been trained
# on a larger vocabulary.
#
# The expectation is that smoothing spreads probability mass across every
# known term, leaving less for any single unseen word as the vocabulary
# grows.
def test_laplace_smoothing_vocabulary_scaling
  # Builds a Cat/Dog classifier trained on the given word lists.
  trained_on = lambda do |cat_words, dog_words|
    Classifier::Bayes.new('Cat', 'Dog').tap do |bayes|
      bayes.train_cat(cat_words)
      bayes.train_dog(dog_words)
    end
  end

  compact = trained_on.call('meow purr', 'bark woof')
  broad = trained_on.call(
    'meow purr hiss scratch climb jump pounce stalk hunt sleep',
    'bark woof growl fetch run play chase guard protect howl'
  )

  compact_cat = compact.classifications('unknown')['Cat']
  broad_cat = broad.classifications('unknown')['Cat']

  # Scores are compared directly: the larger vocabulary should yield the
  # lower (more negative) score.
  assert_operator(compact_cat, :>, broad_cat,
                  'Larger vocabulary should give lower scores for unseen words')
end
393+
# Smoothing is expected to apply to words that WERE seen in training, not
# only to unseen ones: P(word|cat) = (count + alpha) / (total + alpha * V)
# rather than count / total.
#
# Worked example this test is built around: "test" occurs once in A with
# total = 1 and vocab = 2, so the smoothed value is (1 + 1) / (1 + 1 * 2)
# = 2/3 and log(2/3) is roughly -0.405. Without smoothing it would be
# 1 / 1 = 1.0 and log(1) = 0, leaving the score equal to the prior alone.
def test_laplace_smoothing_seen_words_also_smoothed
  bayes = Classifier::Bayes.new('A', 'B')
  bayes.train_a('test')
  bayes.train_b('other')

  # Baseline if the word contributed nothing; assumes equal priors for
  # the two categories.
  baseline = Math.log(0.5)
  score_for_a = bayes.classifications('test')['A']

  # A smoothed word term must move the score away from the bare prior.
  refute_in_delta(score_for_a, baseline, 0.01,
                  'Seen word score should include smoothing adjustment, not raw probability')
end
416+
# The smoothing denominator is expected to be (total + alpha * vocab_size)
# rather than just total, so enlarging the vocabulary should shift the
# score of a word whose own category is unchanged.
#
# Both classifiers get identical Spam training; only the Ham text differs
# in how many words it contributes.
def test_laplace_smoothing_denominator_includes_vocabulary
  ham_texts = ['hello friend', 'hello friend goodbye see you later take care']

  narrow, wide = ham_texts.map do |ham_text|
    Classifier::Bayes.new('Spam', 'Ham').tap do |bayes|
      bayes.train_spam('buy now')
      bayes.train_ham(ham_text)
    end
  end

  # Same query word against both classifiers; only the vocabulary differs.
  narrow_spam = narrow.classifications('buy')['Spam']
  wide_spam = wide.classifications('buy')['Spam']

  refute_in_delta(narrow_spam, wide_spam, 0.1,
                  'Vocabulary size should affect word probabilities in denominator')
end
340438end
0 commit comments