fix(lsi): improve sentence and paragraph splitting in Summary (#98)

cardmagic · web-flow · commit b562e6cc3dcb · 2025-12-28T14:48:57.000-08:00
Sentence splitting now handles common edge cases:
- Abbreviations (Mr., Dr., Inc., Corp., etc.)
- Decimal numbers ($3.50)
- Sentences without trailing spaces

Also supports the pragmatic_segmenter gem for higher accuracy when installed.

Paragraph splitting improved to handle:
- Unix (\n\n) and Windows (\r\n\r\n) line endings
- Multiple consecutive newlines

Closes #68
diff --git a/README.md b/README.md
@@ -142,6 +142,21 @@ lsi.search "programming", 3
 # => ["Ruby is a dynamic programming language", "Python is great for..."]
 ```
 
+### Text Summarization
+
+LSI can extract key sentences from text:
+
+```ruby
+text = "First sentence about dogs. Second about cats. Third about birds."
+text.summary(2)  # Extract 2 most relevant sentences
+```
+
+For better sentence boundary detection (handles abbreviations like "Dr.", decimals, etc.), install the optional `pragmatic_segmenter` gem:
+
+```ruby
+gem 'pragmatic_segmenter'
+```
+
 ### Learn More
 
 - [LSI Basics Guide](https://rubyclassifier.com/docs/guides/lsi/basics) - In-depth documentation
diff --git a/lib/classifier/lsi/summary.rb b/lib/classifier/lsi/summary.rb
@@ -3,6 +3,8 @@
 # License::   LGPL
 
 class String
+  # Abbreviations whose trailing period must not be treated as a sentence
+  # boundary. Read by split_sentences_regex, which matches them case-insensitively.
+  ABBREVIATIONS = %w[Mr Mrs Ms Dr Prof Jr Sr Inc Ltd Corp Co vs etc al eg ie].freeze
+
+  # Builds a sentence-level summary of the string.
+  #
+  # The string is split with split_sentences and the chunks are handed to
+  # perform_lsi, which requests +count+ chunks from the LSI index and joins
+  # them with +separator+.
   def summary(count = 10, separator = ' [...] ')
     perform_lsi split_sentences, count, separator
   end
@@ -12,20 +14,38 @@ def paragraph_summary(count = 1, separator = ' [...] ')
   end
 
   def split_sentences
-    split(/(\.|!|\?)/) # TODO: make this less primitive
+    return pragmatic_segment if defined?(PragmaticSegmenter)
+
+    split_sentences_regex
   end
 
   def split_paragraphs
-    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
+    split(/\r?\n\r?\n+/)
   end
 
   private
 
+  # Segments the string with PragmaticSegmenter::Segmenter and returns
+  # whatever its #segment call yields.
+  def pragmatic_segment
+    segmenter = PragmaticSegmenter::Segmenter.new(text: self)
+    segmenter.segment
+  end
+
+  # Regex-based sentence splitter used when PragmaticSegmenter is unavailable.
+  #
+  # Periods that do not end a sentence are temporarily replaced by a
+  # placeholder, the text is split after ".", "!" or "?", and the placeholder
+  # is turned back into "." in every resulting sentence.
+  #
+  # Protected periods:
+  # * the period after any ABBREVIATIONS entry (case-insensitive)
+  # * both periods of the dotted forms "e.g." and "i.e." (the bare "eg"/"ie"
+  #   entries only match the undotted spellings "eg." / "ie.")
+  # * periods between two digits ("3.50", "1.2.3")
+  #
+  # Known limitation: a sentence that ends with a protected abbreviation
+  # ("... apples etc. They ...") is not split at that point.
+  #
+  # Returns an Array of sentence Strings.
+  def split_sentences_regex
+    placeholder = '<<<DOT>>>'
+    abbreviations = ABBREVIATIONS.map { |a| Regexp.escape(a) } + ['e\.g', 'i\.e']
+    abbrev_pattern = /\b(?:#{abbreviations.join('|')})\./i
+
+    text = gsub(abbrev_pattern) { |m| m.gsub('.', placeholder) }
+    # Lookarounds leave the digits unconsumed so every dot in "1.2.3" is protected.
+    text = text.gsub(/(?<=\d)\.(?=\d)/, placeholder)
+    # Split after terminal punctuation, either on whitespace or directly before
+    # a capital letter (sentences written without a separating space).
+    sentences = text.split(/(?<=[.!?])(?:\s+|(?=[A-Z]))/)
+    sentences.map { |s| s.gsub(placeholder, '.') }
+  end
+
   def perform_lsi(chunks, count, separator)
     lsi = Classifier::LSI.new auto_rebuild: false
-    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+    chunks.each do |chunk|
+      stripped = chunk.strip
+      next if stripped.empty? || stripped.split.size == 1
+
+      lsi << chunk
+    end
     lsi.build_index
-    summaries = lsi.highest_relative_content count
-    summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
+    lsi.highest_relative_content(count).map(&:strip).join(separator)
   end
 end
diff --git a/test/lsi/summary_test.rb b/test/lsi/summary_test.rb
@@ -0,0 +1,113 @@
+require_relative '../test_helper'
+
+# Exercises the sentence/paragraph splitting and summary helpers that
+# lib/classifier/lsi/summary.rb defines on String. Splitting tests compare the
+# whole result array so that both the count and every element are pinned.
+class SummaryTest < Minitest::Test
+  def test_split_sentences_basic
+    sentences = 'Hello world. This is a test. How are you?'.split_sentences
+
+    assert_equal ['Hello world.', 'This is a test.', 'How are you?'], sentences
+  end
+
+  def test_split_sentences_with_abbreviations
+    sentences = 'Dr. Smith went to the store. He bought milk.'.split_sentences
+
+    assert_equal ['Dr. Smith went to the store.', 'He bought milk.'], sentences
+  end
+
+  def test_split_sentences_with_mr_mrs
+    sentences = 'Mr. Jones met Mrs. Smith. They talked.'.split_sentences
+
+    assert_equal ['Mr. Jones met Mrs. Smith.', 'They talked.'], sentences
+  end
+
+  def test_split_sentences_with_decimals
+    sentences = 'The price is $3.50 per unit. That is expensive.'.split_sentences
+
+    assert_equal ['The price is $3.50 per unit.', 'That is expensive.'], sentences
+  end
+
+  def test_split_sentences_with_exclamation
+    sentences = 'Hello! How are you? I am fine.'.split_sentences
+
+    assert_equal ['Hello!', 'How are you?', 'I am fine.'], sentences
+  end
+
+  def test_split_sentences_with_inc_corp
+    sentences = 'Apple Inc. makes phones. Microsoft Corp. makes software.'.split_sentences
+
+    assert_equal ['Apple Inc. makes phones.', 'Microsoft Corp. makes software.'], sentences
+  end
+
+  def test_split_sentences_with_etc
+    sentences = 'We need apples, oranges, etc. for the party. Please bring them.'.split_sentences
+
+    assert_equal ['We need apples, oranges, etc. for the party.', 'Please bring them.'], sentences
+  end
+
+  def test_split_paragraphs_double_newline
+    paragraphs = "First paragraph.\n\nSecond paragraph.".split_paragraphs
+
+    assert_equal ['First paragraph.', 'Second paragraph.'], paragraphs
+  end
+
+  def test_split_paragraphs_windows_line_endings
+    paragraphs = "First paragraph.\r\n\r\nSecond paragraph.".split_paragraphs
+
+    assert_equal ['First paragraph.', 'Second paragraph.'], paragraphs
+  end
+
+  def test_split_paragraphs_multiple_newlines
+    paragraphs = "First paragraph.\n\n\n\nSecond paragraph.".split_paragraphs
+
+    assert_equal ['First paragraph.', 'Second paragraph.'], paragraphs
+  end
+
+  def test_split_paragraphs_mixed_line_endings
+    paragraphs = "First.\r\n\r\nSecond.\n\nThird.".split_paragraphs
+
+    assert_equal ['First.', 'Second.', 'Third.'], paragraphs
+  end
+
+  def test_summary_returns_string
+    text = 'This is sentence one. This is sentence two. This is sentence three.'
+    result = text.summary(2)
+
+    assert_instance_of String, result
+  end
+
+  def test_paragraph_summary_returns_string
+    text = "First paragraph with content.\n\nSecond paragraph with more content."
+    result = text.paragraph_summary(1)
+
+    assert_instance_of String, result
+  end
+end