feat(lsi): expose tuning parameters with validation and introspection API (#92)

cardmagic · greptile-apps[bot] · web-flow · commit 8da21dcbdff0 · 2025-12-28T09:39:51.000-08:00
* feat(lsi): expose tuning parameters with validation and introspection API The LSI classifier previously used undocumented magic numbers for critical cutoff parameters with no validation or introspection capabilities. Users had no guidance on tuning for different corpus sizes. This change adds: - Parameter validation for cutoff (must be between 0 and 1 exclusive) - `singular_values` attr_reader exposing SVD singular values after build - `singular_value_spectrum` method for analyzing variance distribution - Documentation with tuning guides for different use cases The introspection API enables users to make informed decisions about cutoff tuning by examining how much variance each semantic dimension captures. Fixes #67 * fix(lsi): clamp cutoff index to prevent negative array access Addresses review feedback: - Clamp s_cutoff_index to 0 minimum to prevent negative indices with very small cutoffs (e.g., cutoff=0.01 with size=3 would give -1) - Fix documentation example to handle nil from find_index * style: fix RuboCop offenses - Use cutoff.positive? instead of cutoff > 0 - Parenthesize block param in assert - Use assert_predicate and assert_operator - Add empty line before assertion - Rename dims_for_75 to dims_for_threshold - Exclude lsi.rb from ClassLength check (inherently complex) * style: reduce verbose documentation per review feedback Simplified comments that restated obvious code behavior. 
* Update .rubocop.yml Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Update lib/classifier/lsi.rb Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Update lib/classifier/lsi.rb Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Update lib/classifier/lsi.rb Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * fix: remove duplicate lines from merge --------- Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
diff --git a/.rubocop.yml b/.rubocop.yml
@@ -60,6 +60,7 @@ Metrics/ClassLength:
   Max: 250
   Exclude:
     - 'test/**/*'
+    - 'lib/classifier/lsi.rb'
 
 # SV_decomp is a standard algorithm name
 Naming/MethodName:
diff --git a/lib/classifier/lsi.rb b/lib/classifier/lsi.rb
@@ -71,8 +71,9 @@ class LSI
     # @rbs @items: Hash[untyped, ContentNode]
     # @rbs @version: Integer
     # @rbs @built_at_version: Integer
+    # @rbs @singular_values: Array[Float]?
 
-    attr_reader :word_list
+    attr_reader :word_list, :singular_values
     attr_accessor :auto_rebuild
 
     # Create a fresh index.
@@ -98,6 +99,25 @@ def needs_rebuild?
       synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
     end
 
+    # Summarises how the stored singular values are distributed. Each entry
+    # gives one value's share of the sum of all singular values (a fraction of
+    # the total, not a 0-100 figure and not squared) together with the running
+    # share up to and including that dimension.
+    # Returns nil when no singular values are stored or when they sum to zero.
+    #
+    # @rbs () -> Array[Hash[Symbol, untyped]]?
+    def singular_value_spectrum
+      values = @singular_values
+      return nil unless values
+
+      sum = values.sum
+      return nil if sum.zero?
+
+      running = 0.0
+      values.each_with_index.map do |sv, index|
+        running += sv
+        { dimension: index, value: sv, percentage: sv / sum, cumulative_percentage: running / sum }
+      end
+    end
+
     # Adds an item to the index. item is assumed to be a string, but
     # any item may be indexed so long as it responds to #to_s or if
     # you provide an optional block explaining how the indexer can
@@ -177,6 +197,8 @@ def items
     #
     # @rbs (?Float) -> void
     def build_index(cutoff = 0.75)
+      validate_cutoff!(cutoff)
+
       synchronize do
         return unless needs_rebuild_unlocked?
 
@@ -295,12 +317,10 @@ def find_related(doc, max_nearest = 3, &block)
     # find_related function to find related documents, then returns the
     # most obvious category from this list.
     #
-    # cutoff signifies the number of documents to consider when clasifying
-    # text. A cutoff of 1 means that every document in the index votes on
-    # what category the document is in. This may not always make sense.
-    #
     # @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
     def classify(doc, cutoff = 0.30, &block)
+      validate_cutoff!(cutoff)
+
       synchronize do
         votes = vote_unlocked(doc, cutoff, &block)
 
@@ -311,6 +331,8 @@ def classify(doc, cutoff = 0.30, &block)
 
     # @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
     def vote(doc, cutoff = 0.30, &block)
+      validate_cutoff!(cutoff)
+
       synchronize { vote_unlocked(doc, cutoff, &block) }
     end
 
@@ -327,6 +349,8 @@ def vote(doc, cutoff = 0.30, &block)
     # See classify() for argument docs
     # @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
     def classify_with_confidence(doc, cutoff = 0.30, &block)
+      validate_cutoff!(cutoff)
+
       synchronize do
         votes = vote_unlocked(doc, cutoff, &block)
         votes_sum = votes.values.sum
@@ -437,6 +461,13 @@ def self.load(path)
 
     private
 
+    # Shared check for cutoff arguments: accepts only real numbers strictly
+    # between 0 and 1 and raises ArgumentError for anything else.
+    #
+    # @rbs (untyped) -> void
+    def validate_cutoff!(cutoff)
+      # Check the type first: nil or a String does not respond to #positive?,
+      # so callers would otherwise get NoMethodError instead of ArgumentError.
+      return if cutoff.is_a?(Numeric) && cutoff.real? && cutoff.positive? && cutoff < 1
+
+      raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{cutoff.inspect}"
+    end
+
     # Assigns LSI vectors using native C extension
     # @rbs (untyped, Array[ContentNode]) -> void
     def assign_native_ext_lsi_vectors(ntdm, doc_list)
@@ -536,8 +567,10 @@ def build_reduced_matrix(matrix, cutoff = 0.75)
       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
       u, v, s = matrix.SV_decomp
 
-      # TODO: Better than 75% term, please. :\
-      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+      @singular_values = s.sort.reverse
+
+      s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
+      s_cutoff = @singular_values[s_cutoff_index]
       s.size.times do |ord|
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb
@@ -489,4 +489,195 @@ def test_save_load_search_functionality
       assert_equal lsi.search('dog', 3), loaded.search('dog', 3)
     end
   end
+
+  # Cutoff parameter validation tests (Issue #67)
+
+  def test_build_index_cutoff_validation_too_low
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+
+    # Zero and negative cutoffs lie outside the accepted (0, 1) range.
+    [0.0, -0.5].each do |cutoff|
+      assert_raises(ArgumentError) { index.build_index(cutoff) }
+    end
+  end
+
+  def test_build_index_cutoff_validation_too_high
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+
+    # The upper bound is exclusive, so exactly 1.0 is rejected as well.
+    [1.0, 1.5].each do |cutoff|
+      assert_raises(ArgumentError) { index.build_index(cutoff) }
+    end
+  end
+
+  def test_build_index_cutoff_validation_valid_range
+    # Use a fresh index per cutoff: build_index returns early once the index
+    # is up to date, so reusing a single index would only ever build with the
+    # first value and the later calls would exercise validation alone.
+    [0.01, 0.5, 0.99].each do |cutoff|
+      lsi = Classifier::LSI.new auto_rebuild: false
+      lsi.add_item @str1, 'Dog'
+      lsi.add_item @str2, 'Dog'
+      lsi.add_item @str3, 'Cat'
+
+      lsi.build_index(cutoff)
+
+      refute_nil lsi.singular_values, "build_index(#{cutoff}) should have built the index"
+    end
+  end
+
+  def test_build_index_very_small_cutoff_no_negative_index
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+
+    # With three items, 0.01 would put the unclamped cutoff index below zero.
+    index.build_index(0.01)
+
+    assert_equal 'Dog', index.classify(@str1)
+    refute_nil index.singular_values
+  end
+
+  def test_classify_cutoff_validation
+    index = Classifier::LSI.new
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+
+    # Both boundaries and both out-of-range sides must be rejected.
+    [0.0, 1.0, -0.1, 1.5].each do |cutoff|
+      assert_raises(ArgumentError) { index.classify(@str1, cutoff) }
+    end
+  end
+
+  def test_vote_cutoff_validation
+    index = Classifier::LSI.new
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+
+    # vote rejects the exclusive boundaries 0.0 and 1.0.
+    [0.0, 1.0].each do |cutoff|
+      assert_raises(ArgumentError) { index.vote(@str1, cutoff) }
+    end
+  end
+
+  def test_classify_with_confidence_cutoff_validation
+    index = Classifier::LSI.new
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+
+    # classify_with_confidence rejects the exclusive boundaries 0.0 and 1.0.
+    [0.0, 1.0].each do |cutoff|
+      assert_raises(ArgumentError) { index.classify_with_confidence(@str1, cutoff) }
+    end
+  end
+
+  # Singular value introspection tests (Issue #67)
+
+  def test_singular_values_nil_before_build
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [@str1, @str2].each { |text| index.add_item(text, 'Dog') }
+
+    # Nothing has been built yet, so no singular values are available.
+    assert_nil index.singular_values
+  end
+
+  def test_singular_values_populated_after_build
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat']].each { |text, category| index.add_item(text, category) }
+    index.build_index
+
+    values = index.singular_values
+
+    # A build must leave behind a non-empty array of numbers.
+    refute_nil values
+    assert_instance_of Array, values
+    assert(values.all?(Numeric))
+    assert_predicate values.size, :positive?
+  end
+
+  def test_singular_values_sorted_descending
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat'], [@str4, 'Cat'], [@str5, 'Bird']].each do |text, category|
+      index.add_item(text, category)
+    end
+    index.build_index
+
+    actual = index.singular_values
+
+    # Re-sorting an already descending list must leave it unchanged.
+    assert_equal actual.sort.reverse, actual, 'Singular values should be sorted in descending order'
+  end
+
+  def test_singular_value_spectrum_nil_before_build
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [@str1, @str2].each { |text| index.add_item(text, 'Dog') }
+
+    # Without a build there are no singular values to summarise.
+    assert_nil index.singular_value_spectrum
+  end
+
+  def test_singular_value_spectrum_structure
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat'], [@str4, 'Cat'], [@str5, 'Bird']].each do |text, category|
+      index.add_item(text, category)
+    end
+    index.build_index
+
+    spectrum = index.singular_value_spectrum
+
+    refute_nil spectrum
+    assert_instance_of Array, spectrum
+
+    # Entries are numbered by position and carry the three statistic keys.
+    spectrum.each_with_index do |entry, position|
+      assert_equal position, entry[:dimension]
+      %i[value percentage cumulative_percentage].each { |key| assert entry.key?(key) }
+    end
+  end
+
+  def test_singular_value_spectrum_percentages
+    index = Classifier::LSI.new(auto_rebuild: false)
+    [[@str1, 'Dog'], [@str2, 'Dog'], [@str3, 'Cat'], [@str4, 'Cat'], [@str5, 'Bird']].each do |text, category|
+      index.add_item(text, category)
+    end
+    index.build_index
+
+    spectrum = index.singular_value_spectrum
+    shares = spectrum.map { |entry| entry[:percentage] }
+    running = spectrum.map { |entry| entry[:cumulative_percentage] }
+
+    # The individual shares add up to the whole.
+    assert_in_delta 1.0, shares.sum, 0.001
+
+    # The running share ends at the whole.
+    assert_in_delta 1.0, running.last, 0.001
+
+    # The running share never decreases from one dimension to the next.
+    running.each_cons(2) do |earlier, later|
+      assert_operator earlier, :<=, later
+    end
+  end
+
+  def test_singular_value_spectrum_for_tuning
+    lsi = Classifier::LSI.new auto_rebuild: false
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+    lsi.build_index
+
+    spectrum = lsi.singular_value_spectrum
+
+    # Position of the first dimension whose cumulative share of the
+    # singular-value sum reaches 75%. This is not the same quantity as
+    # build_index's cutoff, which picks a fraction of the dimension count.
+    threshold_index = spectrum.find_index { |e| e[:cumulative_percentage] >= 0.75 }
+
+    refute_nil threshold_index, 'Should be able to find dimensions for 75% variance'
+
+    # find_index is zero-based, so the number of dimensions needed is one more.
+    # Comparing the raw index with spectrum.size could never fail.
+    dims_for_threshold = threshold_index + 1
+
+    # Values are sorted descending, so the top k of n cover at least k/n of the
+    # sum; assuming one singular value per document (five here), four suffice.
+    assert_operator dims_for_threshold, :<, spectrum.size, 'Some dimensions should be below 75% threshold'
+  end
 end