Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Metrics/ClassLength:
Max: 250
Exclude:
- 'test/**/*'
- 'lib/classifier/lsi.rb'

# SV_decomp is a standard algorithm name
Naming/MethodName:
Expand Down
47 changes: 40 additions & 7 deletions lib/classifier/lsi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@ class LSI
# @rbs @items: Hash[untyped, ContentNode]
# @rbs @version: Integer
# @rbs @built_at_version: Integer
# @rbs @singular_values: Array[Float]?

attr_reader :word_list
attr_reader :word_list, :singular_values
attr_accessor :auto_rebuild

# Create a fresh index.
Expand All @@ -98,6 +99,25 @@ def needs_rebuild?
synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
end

# @rbs () -> Array[Hash[Symbol, untyped]]?
def singular_value_spectrum
return nil unless @singular_values

total = @singular_values.sum
return nil if total.zero?

cumulative = 0.0
@singular_values.map.with_index do |value, i|
cumulative += value
{
dimension: i,
value: value,
percentage: value / total,
cumulative_percentage: cumulative / total
}
end
end

# Adds an item to the index. item is assumed to be a string, but
# any item may be indexed so long as it responds to #to_s or if
# you provide an optional block explaining how the indexer can
Expand Down Expand Up @@ -177,6 +197,8 @@ def items
#
# @rbs (?Float) -> void
def build_index(cutoff = 0.75)
validate_cutoff!(cutoff)

synchronize do
return unless needs_rebuild_unlocked?

Expand Down Expand Up @@ -295,12 +317,10 @@ def find_related(doc, max_nearest = 3, &block)
# find_related function to find related documents, then returns the
# most obvious category from this list.
#
# cutoff signifies the number of documents to consider when clasifying
# text. A cutoff of 1 means that every document in the index votes on
# what category the document is in. This may not always make sense.
#
# @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
def classify(doc, cutoff = 0.30, &block)
validate_cutoff!(cutoff)

synchronize do
votes = vote_unlocked(doc, cutoff, &block)

Expand All @@ -311,6 +331,8 @@ def classify(doc, cutoff = 0.30, &block)

# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
def vote(doc, cutoff = 0.30, &block)
validate_cutoff!(cutoff)

synchronize { vote_unlocked(doc, cutoff, &block) }
end

Expand All @@ -327,6 +349,8 @@ def vote(doc, cutoff = 0.30, &block)
# See classify() for argument docs
# @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
def classify_with_confidence(doc, cutoff = 0.30, &block)
validate_cutoff!(cutoff)

synchronize do
votes = vote_unlocked(doc, cutoff, &block)
votes_sum = votes.values.sum
Expand Down Expand Up @@ -437,6 +461,13 @@ def self.load(path)

private

# @rbs (Float) -> void
def validate_cutoff!(cutoff)
return if cutoff.positive? && cutoff < 1

raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{cutoff}"
end

# Assigns LSI vectors using native C extension
# @rbs (untyped, Array[ContentNode]) -> void
def assign_native_ext_lsi_vectors(ntdm, doc_list)
Expand Down Expand Up @@ -536,8 +567,10 @@ def build_reduced_matrix(matrix, cutoff = 0.75)
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
u, v, s = matrix.SV_decomp

# TODO: Better than 75% term, please. :\
s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
@singular_values = s.sort.reverse

s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
s_cutoff = @singular_values[s_cutoff_index]
s.size.times do |ord|
s[ord] = 0.0 if s[ord] < s_cutoff
end
Expand Down
191 changes: 191 additions & 0 deletions test/lsi/lsi_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -489,4 +489,195 @@ def test_save_load_search_functionality
assert_equal lsi.search('dog', 3), loaded.search('dog', 3)
end
end

# Cutoff parameter validation tests (Issue #67)

def test_build_index_cutoff_validation_too_low
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

assert_raises(ArgumentError) { lsi.build_index(0.0) }
assert_raises(ArgumentError) { lsi.build_index(-0.5) }
end

def test_build_index_cutoff_validation_too_high
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

assert_raises(ArgumentError) { lsi.build_index(1.0) }
assert_raises(ArgumentError) { lsi.build_index(1.5) }
end

def test_build_index_cutoff_validation_valid_range
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

# Should not raise for valid cutoffs
lsi.build_index(0.01)
lsi.build_index(0.5)
lsi.build_index(0.99)
end

def test_build_index_very_small_cutoff_no_negative_index
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

lsi.build_index(0.01)

assert_equal 'Dog', lsi.classify(@str1)
refute_nil lsi.singular_values
end

def test_classify_cutoff_validation
lsi = Classifier::LSI.new
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

assert_raises(ArgumentError) { lsi.classify(@str1, 0.0) }
assert_raises(ArgumentError) { lsi.classify(@str1, 1.0) }
assert_raises(ArgumentError) { lsi.classify(@str1, -0.1) }
assert_raises(ArgumentError) { lsi.classify(@str1, 1.5) }
end

def test_vote_cutoff_validation
lsi = Classifier::LSI.new
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

assert_raises(ArgumentError) { lsi.vote(@str1, 0.0) }
assert_raises(ArgumentError) { lsi.vote(@str1, 1.0) }
end

def test_classify_with_confidence_cutoff_validation
lsi = Classifier::LSI.new
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'

assert_raises(ArgumentError) { lsi.classify_with_confidence(@str1, 0.0) }
assert_raises(ArgumentError) { lsi.classify_with_confidence(@str1, 1.0) }
end

# Singular value introspection tests (Issue #67)

def test_singular_values_nil_before_build
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'

assert_nil lsi.singular_values
end

def test_singular_values_populated_after_build
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'
lsi.build_index

refute_nil lsi.singular_values
assert_instance_of Array, lsi.singular_values
assert(lsi.singular_values.all? { |v| v.is_a?(Numeric) })
assert_predicate lsi.singular_values.size, :positive?
end

def test_singular_values_sorted_descending
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'
lsi.add_item @str4, 'Cat'
lsi.add_item @str5, 'Bird'
lsi.build_index

values = lsi.singular_values
sorted = values.sort.reverse

assert_equal sorted, values, 'Singular values should be sorted in descending order'
end

def test_singular_value_spectrum_nil_before_build
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'

assert_nil lsi.singular_value_spectrum
end

def test_singular_value_spectrum_structure
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'
lsi.add_item @str4, 'Cat'
lsi.add_item @str5, 'Bird'
lsi.build_index

spectrum = lsi.singular_value_spectrum

refute_nil spectrum
assert_instance_of Array, spectrum

# Each entry should have required keys
spectrum.each_with_index do |entry, i|
assert_equal i, entry[:dimension]
assert entry.key?(:value)
assert entry.key?(:percentage)
assert entry.key?(:cumulative_percentage)
end
end

def test_singular_value_spectrum_percentages
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'
lsi.add_item @str4, 'Cat'
lsi.add_item @str5, 'Bird'
lsi.build_index

spectrum = lsi.singular_value_spectrum

# Individual percentages should sum to 1
total_pct = spectrum.sum { |e| e[:percentage] }

assert_in_delta 1.0, total_pct, 0.001

# Cumulative should reach 1.0 at the end
assert_in_delta 1.0, spectrum.last[:cumulative_percentage], 0.001

# Cumulative should be monotonically increasing
spectrum.each_cons(2) do |a, b|
assert_operator a[:cumulative_percentage], :<=, b[:cumulative_percentage]
end
end

def test_singular_value_spectrum_for_tuning
lsi = Classifier::LSI.new auto_rebuild: false
lsi.add_item @str1, 'Dog'
lsi.add_item @str2, 'Dog'
lsi.add_item @str3, 'Cat'
lsi.add_item @str4, 'Cat'
lsi.add_item @str5, 'Bird'
lsi.build_index

spectrum = lsi.singular_value_spectrum

# Find how many dimensions capture 75% of variance (the default cutoff)
dims_for_threshold = spectrum.find_index { |e| e[:cumulative_percentage] >= 0.75 }

# This should be usable for tuning decisions
refute_nil dims_for_threshold, 'Should be able to find dimensions for 75% variance'
assert_operator dims_for_threshold, :<, spectrum.size, 'Some dimensions should be below 75% threshold'
end
end