Skip to content

Commit b562e6c

Browse files
authored
fix(lsi): improve sentence and paragraph splitting in Summary (#98)
Sentence splitting now handles common edge cases:
- Abbreviations (Mr., Dr., Inc., Corp., etc.)
- Decimal numbers ($3.50)
- Sentences without trailing spaces

Also supports the pragmatic_segmenter gem for higher accuracy when installed.

Paragraph splitting improved to handle:
- Unix (\n\n) and Windows (\r\n\r\n) line endings
- Multiple consecutive newlines

Closes #68
1 parent ac7d056 commit b562e6c

File tree

3 files changed

+153
-5
lines changed

3 files changed

+153
-5
lines changed

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,21 @@ lsi.search "programming", 3
142142
# => ["Ruby is a dynamic programming language", "Python is great for..."]
143143
```
144144

145+
### Text Summarization
146+
147+
LSI can extract key sentences from text:
148+
149+
```ruby
150+
text = "First sentence about dogs. Second about cats. Third about birds."
151+
text.summary(2) # Extract 2 most relevant sentences
152+
```
153+
154+
For better sentence boundary detection (handles abbreviations like "Dr.", decimals, etc.), install the optional `pragmatic_segmenter` gem:
155+
156+
```ruby
157+
gem 'pragmatic_segmenter'
158+
```
159+
145160
### Learn More
146161

147162
- [LSI Basics Guide](https://rubyclassifier.com/docs/guides/lsi/basics) - In-depth documentation

lib/classifier/lsi/summary.rb

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# License:: LGPL
44

55
class String
6+
# Abbreviations whose trailing period should not be treated as the end of a
# sentence. Entries are stored without the period; split_sentences_regex
# appends it and matches case-insensitively, so "eg"/"ie" cover "eg."/"ie."
# but the list itself has no entries for dotted spellings such as "e.g.".
ABBREVIATIONS = %w[Mr Mrs Ms Dr Prof Jr Sr Inc Ltd Corp Co vs etc al eg ie].freeze
7+
68
# Builds a summary of the receiver from its sentences.
#
# count     - how many sentences to request from the LSI ranking (default 10).
# separator - string placed between the selected sentences.
#
# Returns the selected sentences joined by +separator+.
def summary(count = 10, separator = ' [...] ')
  sentences = split_sentences
  perform_lsi(sentences, count, separator)
end
@@ -12,20 +14,38 @@ def paragraph_summary(count = 1, separator = ' [...] ')
1214
end
1315

1416
# Splits the receiver into sentences.
#
# Uses PragmaticSegmenter when that constant is already defined (i.e. the
# optional gem has been loaded); otherwise falls back to the built-in
# regex-based splitter.
def split_sentences
  if defined?(PragmaticSegmenter)
    pragmatic_segment
  else
    split_sentences_regex
  end
end
1721

1822
# Splits the receiver into paragraphs.
#
# A paragraph break is a run of two or more consecutive line terminators,
# where a terminator is "\r\n", "\n" or "\r". The whole run is consumed, so
# any number of blank lines (in any mix of line-ending styles) yields a single
# split and never leaves a stray "\r\n" at the start of the next paragraph.
#
# The terminator alternation is an atomic group so that a lone "\r\n" can never
# be re-read as two terminators ("\r" followed by "\n") through backtracking.
#
# Returns an Array of paragraph strings (separators are not included).
def split_paragraphs
  split(/(?>\r\n|\n|\r){2,}/)
end
2125

2226
private
2327

28+
# Segments the receiver with the optional pragmatic_segmenter gem.
#
# Returns whatever PragmaticSegmenter::Segmenter#segment returns
# (presumably an Array of sentence strings -- confirm against the gem docs).
def pragmatic_segment
  segmenter = PragmaticSegmenter::Segmenter.new(text: self)
  segmenter.segment
end
31+
32+
# Regex-based sentence splitter used when PragmaticSegmenter is not loaded.
#
# Periods that do not end a sentence are temporarily swapped for a sentinel
# character, the text is split after ".", "!" or "?" (when followed by
# whitespace or directly by an uppercase letter), and the sentinel is then
# turned back into a period. Protected periods are:
#
# * the period after any entry of ABBREVIATIONS (case-insensitive),
# * every period in a dotted letter sequence such as "U.S.A.", "e.g." or
#   "J.K." (otherwise the uppercase look-ahead would shatter them),
# * periods between two digits ("3.50", "1.2.3").
#
# Known trade-off: a sentence that *ends* with a protected abbreviation or
# dotted sequence is not split from the following sentence.
#
# Returns an Array of sentence strings with the original periods restored.
def split_sentences_regex
  # NUL is used instead of a printable marker so ordinary text containing the
  # marker cannot be rewritten by the final restore step.
  dot = "\u0000"
  abbreviations = ABBREVIATIONS.map { |abbr| Regexp.escape(abbr) }.join('|')

  shielded = gsub(/\b(?:#{abbreviations})\./i) { |match| match.tr('.', dot) }
  shielded = shielded.gsub(/\b(?:[A-Za-z]\.){2,}/) { |match| match.tr('.', dot) }
  # Look-arounds keep the digits unconsumed so chained decimals are fully covered.
  shielded = shielded.gsub(/(?<=\d)\.(?=\d)/, dot)

  shielded.split(/(?<=[.!?])(?:\s+|(?=[A-Z]))/).map { |sentence| sentence.tr(dot, '.') }
end
39+
2440
# Ranks +chunks+ with an LSI index and joins the top results.
#
# chunks    - enumerable of text fragments (sentences or paragraphs).
# count     - number of fragments to request from the index.
# separator - string placed between the selected fragments.
#
# Fragments that are blank or consist of a single word after stripping are
# not indexed. The index is created with auto_rebuild disabled and built once
# after all fragments have been added.
#
# Returns the selected fragments, stripped and joined by +separator+.
def perform_lsi(chunks, count, separator)
  index = Classifier::LSI.new(auto_rebuild: false)

  chunks.each do |piece|
    # Zero words (blank) or one word: skip; the unstripped piece is what gets indexed.
    index << piece if piece.strip.split.size > 1
  end

  index.build_index
  ranked = index.highest_relative_content(count)
  ranked.map(&:strip).join(separator)
end
3151
end

test/lsi/summary_test.rb

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
require_relative '../test_helper'
2+
3+
# Covers the String summarization helpers: sentence splitting, paragraph
# splitting, and the two summary entry points.
class SummaryTest < Minitest::Test
  # --- split_sentences -------------------------------------------------------

  def test_split_sentences_basic
    expected = ['Hello world.', 'This is a test.', 'How are you?']

    assert_equal expected, 'Hello world. This is a test. How are you?'.split_sentences
  end

  def test_split_sentences_with_abbreviations
    expected = ['Dr. Smith went to the store.', 'He bought milk.']

    assert_equal expected, 'Dr. Smith went to the store. He bought milk.'.split_sentences
  end

  def test_split_sentences_with_mr_mrs
    expected = ['Mr. Jones met Mrs. Smith.', 'They talked.']

    assert_equal expected, 'Mr. Jones met Mrs. Smith. They talked.'.split_sentences
  end

  def test_split_sentences_with_decimals
    expected = ['The price is $3.50 per unit.', 'That is expensive.']

    assert_equal expected, 'The price is $3.50 per unit. That is expensive.'.split_sentences
  end

  def test_split_sentences_with_exclamation
    expected = ['Hello!', 'How are you?', 'I am fine.']

    assert_equal expected, 'Hello! How are you? I am fine.'.split_sentences
  end

  def test_split_sentences_with_inc_corp
    expected = ['Apple Inc. makes phones.', 'Microsoft Corp. makes software.']

    assert_equal expected, 'Apple Inc. makes phones. Microsoft Corp. makes software.'.split_sentences
  end

  def test_split_sentences_with_etc
    parts = 'We need apples, oranges, etc. for the party. Please bring them.'.split_sentences

    assert_equal 2, parts.size
    assert_includes parts.first, 'etc.'
  end

  # --- split_paragraphs ------------------------------------------------------

  def test_split_paragraphs_double_newline
    expected = ['First paragraph.', 'Second paragraph.']

    assert_equal expected, "First paragraph.\n\nSecond paragraph.".split_paragraphs
  end

  def test_split_paragraphs_windows_line_endings
    expected = ['First paragraph.', 'Second paragraph.']

    assert_equal expected, "First paragraph.\r\n\r\nSecond paragraph.".split_paragraphs
  end

  def test_split_paragraphs_multiple_newlines
    blocks = "First paragraph.\n\n\n\nSecond paragraph.".split_paragraphs

    assert_equal 2, blocks.size
  end

  def test_split_paragraphs_mixed_line_endings
    blocks = "First.\r\n\r\nSecond.\n\nThird.".split_paragraphs

    assert_equal 3, blocks.size
  end

  # --- summary entry points --------------------------------------------------

  def test_summary_returns_string
    source = 'This is sentence one. This is sentence two. This is sentence three.'

    assert_instance_of String, source.summary(2)
  end

  def test_paragraph_summary_returns_string
    source = "First paragraph with content.\n\nSecond paragraph with more content."

    assert_instance_of String, source.paragraph_summary(1)
  end
end

0 commit comments

Comments
 (0)