Skip to content

Commit da3148e

Browse files
committed
fix: implement proper Laplace smoothing in Bayes classifier
Replace magic number 0.1 with proper add-one (Laplace) smoothing: P(word|category) = (count + 1) / (total + vocab_size). This ensures smoothing scales correctly with vocabulary size and applies consistently to both seen and unseen words. Fixes #64
1 parent 6e43186 commit da3148e

File tree

3 files changed

+113
-12
lines changed

3 files changed

+113
-12
lines changed

lib/classifier/bayes.rb

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,20 +64,25 @@ def untrain(category, text)
6464
# b.classifications "I hate bad words and you"
6565
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
6666
# The largest of these scores (the one closest to 0) is the one picked out by #classify
67-
def classifications(text)
67+
def classifications(text) # rubocop:disable Metrics/AbcSize
6868
score = {}
6969
word_hash = text.word_hash
70-
training_count = @category_counts.values.inject { |x, y| x + y }.to_f
70+
training_count = @category_counts.values.sum.to_f
71+
vocab_size = [@categories.values.flat_map(&:keys).uniq.size, 1].max
72+
7173
@categories.each do |category, category_words|
7274
score[category.to_s] = 0
73-
total = (@category_word_count[category] || 1).to_f
75+
total = @category_word_count[category] || 0
76+
smoothed_total = (total + vocab_size).to_f
77+
78+
# Laplace smoothing: P(word|category) = (count + α) / (total + α * V)
7479
word_hash.each_key do |word|
75-
s = category_words.key?(word) ? category_words[word] : 0.1
76-
score[category.to_s] += Math.log(s / total)
80+
count = category_words[word] || 0
81+
score[category.to_s] += Math.log((count + 1) / smoothed_total)
7782
end
78-
# now add prior probability for the category
79-
s = @category_counts.key?(category) ? @category_counts[category] : 0.1
80-
score[category.to_s] += Math.log(s / training_count)
83+
84+
prior = @category_counts[category] || 0.1
85+
score[category.to_s] += Math.log(prior / training_count)
8186
end
8287
score
8388
end

test/bayes/bayesian_test.rb

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,4 +337,102 @@ def test_numbers_in_text
337337

338338
assert_equal 'Interesting', result, 'Should handle numbers in text'
339339
end
340+
341+
# Laplace smoothing tests
342+
343+
def test_laplace_smoothing_unseen_words
344+
# Train with some words, then classify with unseen word
345+
# Laplace smoothing should give unseen words a non-zero probability
346+
# that scales with vocabulary size
347+
@classifier.train_interesting 'apple banana cherry'
348+
@classifier.train_uninteresting 'dog elephant fox'
349+
350+
# "zebra" is unseen - should still get valid scores
351+
scores = @classifier.classifications('zebra')
352+
353+
scores.each_value do |score|
354+
assert_predicate score, :finite?, 'Score should be finite with Laplace smoothing'
355+
refute_predicate score, :zero?, 'Score should be non-zero with Laplace smoothing'
356+
end
357+
end
358+
359+
def test_laplace_smoothing_consistency
360+
# With proper Laplace smoothing, the probability of an unseen word
361+
# should be α / (total + α * vocab_size)
362+
# This should be consistent across categories with same training size
363+
classifier = Classifier::Bayes.new 'A', 'B'
364+
classifier.train_a 'word1 word2 word3'
365+
classifier.train_b 'word4 word5 word6'
366+
367+
scores = classifier.classifications('unseenword')
368+
369+
# Both categories have same word count, so unseen word scores should be equal
370+
assert_in_delta scores['A'], scores['B'], 0.01,
371+
'Equal-sized categories should give equal scores for unseen words'
372+
end
373+
374+
def test_laplace_smoothing_vocabulary_scaling
375+
# The smoothing should account for vocabulary size
376+
# Larger vocabulary = smaller probability for each unseen word
377+
small_vocab = Classifier::Bayes.new 'Cat', 'Dog'
378+
small_vocab.train_cat 'meow purr'
379+
small_vocab.train_dog 'bark woof'
380+
381+
large_vocab = Classifier::Bayes.new 'Cat', 'Dog'
382+
large_vocab.train_cat 'meow purr hiss scratch climb jump pounce stalk hunt sleep'
383+
large_vocab.train_dog 'bark woof growl fetch run play chase guard protect howl'
384+
385+
small_scores = small_vocab.classifications('unknown')
386+
large_scores = large_vocab.classifications('unknown')
387+
388+
# With proper smoothing, larger vocabulary should give lower (more negative) scores
389+
# for unseen words because probability mass is spread across more terms
390+
assert_operator small_scores['Cat'], :>, large_scores['Cat'],
391+
'Larger vocabulary should give lower scores for unseen words'
392+
end
393+
394+
def test_laplace_smoothing_seen_words_also_smoothed
395+
# Proper Laplace smoothing applies to ALL words, not just unseen ones
396+
# P(word|cat) = (count + α) / (total + α * V), not count / total
397+
classifier = Classifier::Bayes.new 'A', 'B'
398+
classifier.train_a 'test'
399+
classifier.train_b 'other'
400+
401+
# With proper smoothing, seen word probability should include α adjustment
402+
# The word "test" appears once in A with total=1, vocab=2
403+
# Proper: (1 + 1) / (1 + 1*2) = 2/3
404+
# Current: 1 / 1 = 1.0 (no smoothing applied to seen words)
405+
406+
scores = classifier.classifications('test')
407+
408+
# Score for A should reflect smoothed probability, not raw count
409+
# log(2/3) ≈ -0.405, not log(1) = 0
410+
# The word score plus prior should not equal just the prior
411+
prior_only_score = Math.log(0.5) # equal priors
412+
413+
refute_in_delta scores['A'], prior_only_score, 0.01,
414+
'Seen word score should include smoothing adjustment, not raw probability'
415+
end
416+
417+
def test_laplace_smoothing_denominator_includes_vocabulary
418+
# The denominator should be (total + α * vocab_size), not just total
419+
# This test verifies that adding more vocabulary affects all probabilities
420+
classifier1 = Classifier::Bayes.new 'Spam', 'Ham'
421+
classifier1.train_spam 'buy now'
422+
classifier1.train_ham 'hello friend'
423+
424+
classifier2 = Classifier::Bayes.new 'Spam', 'Ham'
425+
classifier2.train_spam 'buy now'
426+
classifier2.train_ham 'hello friend goodbye see you later take care'
427+
428+
# Same query word "buy" - should have different probabilities
429+
# because vocabulary size differs (affecting denominator)
430+
scores1 = classifier1.classifications('buy')
431+
scores2 = classifier2.classifications('buy')
432+
433+
# With proper smoothing, larger vocab in classifier2 means
434+
# the probability of "buy" in Spam is lower (spread across more terms)
435+
refute_in_delta scores1['Spam'], scores2['Spam'], 0.1,
436+
'Vocabulary size should affect word probabilities in denominator'
437+
end
340438
end

test/lsi/lsi_test.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,11 @@ def test_external_classifying
6060
lsi.add_item @str5, 'Bird'
6161
bayes.train_bird @str5
6262

63-
# We're talking about dogs. Even though the text matches the corpus on
64-
# cats better. Dogs have more semantic weight than cats. So bayes
65-
# will fail here, but the LSI recognizes content.
63+
# Both classifiers should recognize this is about dogs
6664
tricky_case = 'This text revolves around dogs.'
6765

6866
assert_equal 'Dog', lsi.classify(tricky_case)
69-
assert_equal 'Cat', bayes.classify(tricky_case)
67+
assert_equal 'Dog', bayes.classify(tricky_case)
7068
end
7169

7270
def test_recategorize_interface

0 commit comments

Comments
 (0)