Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions sotawhat/sotawhat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@

import nltk
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser
from spellchecker import SpellChecker

# Bind a module-level ``unescape`` callable that decodes HTML entities
# (e.g. ``&amp;`` -> ``&``), whichever Python version is running.
try:
    # html.unescape is available since Python 3.4. It is the only option on
    # Python >= 3.9, where HTMLParser.unescape was removed.
    from html import unescape
except ImportError:
    # Older interpreters: fall back to the HTMLParser.unescape method.
    try:
        from html.parser import HTMLParser  # Python 3.0 - 3.3
    except ImportError:
        from HTMLParser import HTMLParser  # Python 2 (no ``html`` package)
    unescape = HTMLParser().unescape

# Make sure the NLTK 'punkt' tokenizer data is present: look it up locally and
# download it only when the lookup raises LookupError.
# NOTE(review): presumably required by word_tokenize imported above - confirm.
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')

# Module-level HTMLParser instance; get_report calls h.unescape on paper fields.
# NOTE(review): HTMLParser.unescape was removed in Python 3.9 - confirm whether
# this instance is still needed once callers use the module-level unescape.
h = HTMLParser()

# HTML fragments used as string markers for paper fields.
# NOTE(review): presumably matched against arXiv search-result markup - verify
# against the parsing code, which is not visible here.
AUTHOR_TAG = '<a href="/search/?searchtype=author'
TITLE_TAG = '<p class="title is-5 mathjax">'
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
Expand Down Expand Up @@ -183,9 +186,9 @@ def extract_line(abstract, keyword, limit):

def get_report(paper, keyword):
if keyword in paper['abstract'].lower():
title = h.unescape(paper['title'])
title = unescape(paper['title'])
headline = '{} ({} - {})\n'.format(title, paper['authors'][0], paper['date'])
abstract = h.unescape(paper['abstract'])
abstract = unescape(paper['abstract'])
extract, has_number = extract_line(abstract, keyword, 280 - len(headline))
if extract:
report = headline + extract + '\nLink: {}'.format(paper['main_page'])
Expand Down