Skip to content

Commit 8da21dc

Browse files
feat(lsi): expose tuning parameters with validation and introspection API (#92)
* feat(lsi): expose tuning parameters with validation and introspection API The LSI classifier previously used undocumented magic numbers for critical cutoff parameters with no validation or introspection capabilities. Users had no guidance on tuning for different corpus sizes. This change adds: - Parameter validation for cutoff (must be between 0 and 1 exclusive) - `singular_values` attr_reader exposing SVD singular values after build - `singular_value_spectrum` method for analyzing variance distribution - Documentation with tuning guides for different use cases The introspection API enables users to make informed decisions about cutoff tuning by examining how much variance each semantic dimension captures. Fixes #67 * fix(lsi): clamp cutoff index to prevent negative array access Addresses review feedback: - Clamp s_cutoff_index to 0 minimum to prevent negative indices with very small cutoffs (e.g., cutoff=0.01 with size=3 would give -1) - Fix documentation example to handle nil from find_index * style: fix RuboCop offenses - Use cutoff.positive? instead of cutoff > 0 - Parenthesize block param in assert - Use assert_predicate and assert_operator - Add empty line before assertion - Rename dims_for_75 to dims_for_threshold - Exclude lsi.rb from ClassLength check (inherently complex) * style: reduce verbose documentation per review feedback Simplified comments that restated obvious code behavior. * Update .rubocop.yml Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Update lib/classifier/lsi.rb Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Update lib/classifier/lsi.rb Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Update lib/classifier/lsi.rb Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * fix: remove duplicate lines from merge --------- Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 661411d commit 8da21dc

File tree

3 files changed

+232
-7
lines changed

3 files changed

+232
-7
lines changed

.rubocop.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ Metrics/ClassLength:
6060
Max: 250
6161
Exclude:
6262
- 'test/**/*'
63+
- 'lib/classifier/lsi.rb'
6364

6465
# SV_decomp is a standard algorithm name
6566
Naming/MethodName:

lib/classifier/lsi.rb

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,9 @@ class LSI
7171
# @rbs @items: Hash[untyped, ContentNode]
7272
# @rbs @version: Integer
7373
# @rbs @built_at_version: Integer
74+
# @rbs @singular_values: Array[Float]?
7475

75-
attr_reader :word_list
76+
attr_reader :word_list, :singular_values
7677
attr_accessor :auto_rebuild
7778

7879
# Create a fresh index.
@@ -98,6 +99,25 @@ def needs_rebuild?
9899
synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
99100
end
100101

102+
# @rbs () -> Array[Hash[Symbol, untyped]]?
103+
def singular_value_spectrum
104+
return nil unless @singular_values
105+
106+
total = @singular_values.sum
107+
return nil if total.zero?
108+
109+
cumulative = 0.0
110+
@singular_values.map.with_index do |value, i|
111+
cumulative += value
112+
{
113+
dimension: i,
114+
value: value,
115+
percentage: value / total,
116+
cumulative_percentage: cumulative / total
117+
}
118+
end
119+
end
120+
101121
# Adds an item to the index. item is assumed to be a string, but
102122
# any item may be indexed so long as it responds to #to_s or if
103123
# you provide an optional block explaining how the indexer can
@@ -177,6 +197,8 @@ def items
177197
#
178198
# @rbs (?Float) -> void
179199
def build_index(cutoff = 0.75)
200+
validate_cutoff!(cutoff)
201+
180202
synchronize do
181203
return unless needs_rebuild_unlocked?
182204

@@ -295,12 +317,10 @@ def find_related(doc, max_nearest = 3, &block)
295317
# find_related function to find related documents, then returns the
296318
# most obvious category from this list.
297319
#
298-
# cutoff signifies the number of documents to consider when clasifying
299-
# text. A cutoff of 1 means that every document in the index votes on
300-
# what category the document is in. This may not always make sense.
301-
#
302320
# @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
303321
def classify(doc, cutoff = 0.30, &block)
322+
validate_cutoff!(cutoff)
323+
304324
synchronize do
305325
votes = vote_unlocked(doc, cutoff, &block)
306326

@@ -311,6 +331,8 @@ def classify(doc, cutoff = 0.30, &block)
311331

312332
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
313333
def vote(doc, cutoff = 0.30, &block)
334+
validate_cutoff!(cutoff)
335+
314336
synchronize { vote_unlocked(doc, cutoff, &block) }
315337
end
316338

@@ -327,6 +349,8 @@ def vote(doc, cutoff = 0.30, &block)
327349
# See classify() for argument docs
328350
# @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
329351
def classify_with_confidence(doc, cutoff = 0.30, &block)
352+
validate_cutoff!(cutoff)
353+
330354
synchronize do
331355
votes = vote_unlocked(doc, cutoff, &block)
332356
votes_sum = votes.values.sum
@@ -437,6 +461,13 @@ def self.load(path)
437461

438462
private
439463

464+
# @rbs (Float) -> void
465+
def validate_cutoff!(cutoff)
466+
return if cutoff.positive? && cutoff < 1
467+
468+
raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{cutoff}"
469+
end
470+
440471
# Assigns LSI vectors using native C extension
441472
# @rbs (untyped, Array[ContentNode]) -> void
442473
def assign_native_ext_lsi_vectors(ntdm, doc_list)
@@ -536,8 +567,10 @@ def build_reduced_matrix(matrix, cutoff = 0.75)
536567
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
537568
u, v, s = matrix.SV_decomp
538569

539-
# TODO: Better than 75% term, please. :\
540-
s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
570+
@singular_values = s.sort.reverse
571+
572+
s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
573+
s_cutoff = @singular_values[s_cutoff_index]
541574
s.size.times do |ord|
542575
s[ord] = 0.0 if s[ord] < s_cutoff
543576
end

test/lsi/lsi_test.rb

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,4 +489,195 @@ def test_save_load_search_functionality
489489
assert_equal lsi.search('dog', 3), loaded.search('dog', 3)
490490
end
491491
end
492+
493+
# Cutoff parameter validation tests (Issue #67)
494+
495+
def test_build_index_cutoff_validation_too_low
496+
lsi = Classifier::LSI.new auto_rebuild: false
497+
lsi.add_item @str1, 'Dog'
498+
lsi.add_item @str2, 'Dog'
499+
lsi.add_item @str3, 'Cat'
500+
501+
assert_raises(ArgumentError) { lsi.build_index(0.0) }
502+
assert_raises(ArgumentError) { lsi.build_index(-0.5) }
503+
end
504+
505+
def test_build_index_cutoff_validation_too_high
506+
lsi = Classifier::LSI.new auto_rebuild: false
507+
lsi.add_item @str1, 'Dog'
508+
lsi.add_item @str2, 'Dog'
509+
lsi.add_item @str3, 'Cat'
510+
511+
assert_raises(ArgumentError) { lsi.build_index(1.0) }
512+
assert_raises(ArgumentError) { lsi.build_index(1.5) }
513+
end
514+
515+
def test_build_index_cutoff_validation_valid_range
516+
lsi = Classifier::LSI.new auto_rebuild: false
517+
lsi.add_item @str1, 'Dog'
518+
lsi.add_item @str2, 'Dog'
519+
lsi.add_item @str3, 'Cat'
520+
521+
# Should not raise for valid cutoffs
522+
lsi.build_index(0.01)
523+
lsi.build_index(0.5)
524+
lsi.build_index(0.99)
525+
end
526+
527+
def test_build_index_very_small_cutoff_no_negative_index
528+
lsi = Classifier::LSI.new auto_rebuild: false
529+
lsi.add_item @str1, 'Dog'
530+
lsi.add_item @str2, 'Dog'
531+
lsi.add_item @str3, 'Cat'
532+
533+
lsi.build_index(0.01)
534+
535+
assert_equal 'Dog', lsi.classify(@str1)
536+
refute_nil lsi.singular_values
537+
end
538+
539+
def test_classify_cutoff_validation
540+
lsi = Classifier::LSI.new
541+
lsi.add_item @str1, 'Dog'
542+
lsi.add_item @str2, 'Dog'
543+
lsi.add_item @str3, 'Cat'
544+
545+
assert_raises(ArgumentError) { lsi.classify(@str1, 0.0) }
546+
assert_raises(ArgumentError) { lsi.classify(@str1, 1.0) }
547+
assert_raises(ArgumentError) { lsi.classify(@str1, -0.1) }
548+
assert_raises(ArgumentError) { lsi.classify(@str1, 1.5) }
549+
end
550+
551+
def test_vote_cutoff_validation
552+
lsi = Classifier::LSI.new
553+
lsi.add_item @str1, 'Dog'
554+
lsi.add_item @str2, 'Dog'
555+
lsi.add_item @str3, 'Cat'
556+
557+
assert_raises(ArgumentError) { lsi.vote(@str1, 0.0) }
558+
assert_raises(ArgumentError) { lsi.vote(@str1, 1.0) }
559+
end
560+
561+
def test_classify_with_confidence_cutoff_validation
562+
lsi = Classifier::LSI.new
563+
lsi.add_item @str1, 'Dog'
564+
lsi.add_item @str2, 'Dog'
565+
lsi.add_item @str3, 'Cat'
566+
567+
assert_raises(ArgumentError) { lsi.classify_with_confidence(@str1, 0.0) }
568+
assert_raises(ArgumentError) { lsi.classify_with_confidence(@str1, 1.0) }
569+
end
570+
571+
# Singular value introspection tests (Issue #67)
572+
573+
def test_singular_values_nil_before_build
574+
lsi = Classifier::LSI.new auto_rebuild: false
575+
lsi.add_item @str1, 'Dog'
576+
lsi.add_item @str2, 'Dog'
577+
578+
assert_nil lsi.singular_values
579+
end
580+
581+
def test_singular_values_populated_after_build
582+
lsi = Classifier::LSI.new auto_rebuild: false
583+
lsi.add_item @str1, 'Dog'
584+
lsi.add_item @str2, 'Dog'
585+
lsi.add_item @str3, 'Cat'
586+
lsi.build_index
587+
588+
refute_nil lsi.singular_values
589+
assert_instance_of Array, lsi.singular_values
590+
assert(lsi.singular_values.all? { |v| v.is_a?(Numeric) })
591+
assert_predicate lsi.singular_values.size, :positive?
592+
end
593+
594+
def test_singular_values_sorted_descending
595+
lsi = Classifier::LSI.new auto_rebuild: false
596+
lsi.add_item @str1, 'Dog'
597+
lsi.add_item @str2, 'Dog'
598+
lsi.add_item @str3, 'Cat'
599+
lsi.add_item @str4, 'Cat'
600+
lsi.add_item @str5, 'Bird'
601+
lsi.build_index
602+
603+
values = lsi.singular_values
604+
sorted = values.sort.reverse
605+
606+
assert_equal sorted, values, 'Singular values should be sorted in descending order'
607+
end
608+
609+
def test_singular_value_spectrum_nil_before_build
610+
lsi = Classifier::LSI.new auto_rebuild: false
611+
lsi.add_item @str1, 'Dog'
612+
lsi.add_item @str2, 'Dog'
613+
614+
assert_nil lsi.singular_value_spectrum
615+
end
616+
617+
def test_singular_value_spectrum_structure
618+
lsi = Classifier::LSI.new auto_rebuild: false
619+
lsi.add_item @str1, 'Dog'
620+
lsi.add_item @str2, 'Dog'
621+
lsi.add_item @str3, 'Cat'
622+
lsi.add_item @str4, 'Cat'
623+
lsi.add_item @str5, 'Bird'
624+
lsi.build_index
625+
626+
spectrum = lsi.singular_value_spectrum
627+
628+
refute_nil spectrum
629+
assert_instance_of Array, spectrum
630+
631+
# Each entry should have required keys
632+
spectrum.each_with_index do |entry, i|
633+
assert_equal i, entry[:dimension]
634+
assert entry.key?(:value)
635+
assert entry.key?(:percentage)
636+
assert entry.key?(:cumulative_percentage)
637+
end
638+
end
639+
640+
def test_singular_value_spectrum_percentages
641+
lsi = Classifier::LSI.new auto_rebuild: false
642+
lsi.add_item @str1, 'Dog'
643+
lsi.add_item @str2, 'Dog'
644+
lsi.add_item @str3, 'Cat'
645+
lsi.add_item @str4, 'Cat'
646+
lsi.add_item @str5, 'Bird'
647+
lsi.build_index
648+
649+
spectrum = lsi.singular_value_spectrum
650+
651+
# Individual percentages should sum to 1
652+
total_pct = spectrum.sum { |e| e[:percentage] }
653+
654+
assert_in_delta 1.0, total_pct, 0.001
655+
656+
# Cumulative should reach 1.0 at the end
657+
assert_in_delta 1.0, spectrum.last[:cumulative_percentage], 0.001
658+
659+
# Cumulative should be monotonically increasing
660+
spectrum.each_cons(2) do |a, b|
661+
assert_operator a[:cumulative_percentage], :<=, b[:cumulative_percentage]
662+
end
663+
end
664+
665+
def test_singular_value_spectrum_for_tuning
666+
lsi = Classifier::LSI.new auto_rebuild: false
667+
lsi.add_item @str1, 'Dog'
668+
lsi.add_item @str2, 'Dog'
669+
lsi.add_item @str3, 'Cat'
670+
lsi.add_item @str4, 'Cat'
671+
lsi.add_item @str5, 'Bird'
672+
lsi.build_index
673+
674+
spectrum = lsi.singular_value_spectrum
675+
676+
# Find how many dimensions capture 75% of variance (the default cutoff)
677+
dims_for_threshold = spectrum.find_index { |e| e[:cumulative_percentage] >= 0.75 }
678+
679+
# This should be usable for tuning decisions
680+
refute_nil dims_for_threshold, 'Should be able to find dimensions for 75% variance'
681+
assert_operator dims_for_threshold, :<, spectrum.size, 'Some dimensions should be below 75% threshold'
682+
end
492683
end

0 commit comments

Comments
 (0)