Skip to content

Commit e9695ec

Browse files
committed
fixes and updates
1 parent 89f1da2 commit e9695ec

File tree

3 files changed

+30
-16
lines changed

3 files changed

+30
-16
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="text_preprocessing",
9-
version="0.8.4",
9+
version="1.0",
1010
author="The ARTFL Project",
1111
author_email="clovisgladstone@gmail.com",
1212
packages=["text_preprocessing", "text_preprocessing.lang"],

text_preprocessing/preprocessor.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/usr/bin/env python3
22
"""Text Preprocessor"""
33

4-
import json
54
import os
65
import re
76
import sqlite3
@@ -25,7 +24,7 @@
2524
)
2625

2726
import lz4.frame
28-
import rapidjson
27+
import orjson
2928
from multiprocess.pool import Pool
3029
from spacy.tokens import Doc
3130

@@ -287,13 +286,14 @@ def save(self, path):
287286
tokens_to_serialize = {"tokens": [], "metadata": self.metadata}
288287
for token in self:
289288
tokens_to_serialize["tokens"].append((token.text, token.surface_form, token.pos_, token.ext))
290-
with open(path, "w") as output:
291-
json.dump(tokens_to_serialize, output)
289+
with open(path, "wb") as output:
290+
output.write(orjson.dumps(tokens_to_serialize))
292291

293292
def load(self, path):
294293
"""Load tokens from disk"""
295294
with open(path, "r") as input_file:
296-
tokens = json.load(input_file)
295+
data = input_file.read()
296+
tokens = orjson.loads(data)
297297
self.metadata = tokens["metadata"]
298298
self.tokens = deque(Token(t[0], t[1], t[2], t[3]) for t in tokens["tokens"])
299299

@@ -361,7 +361,7 @@ def __init__(
361361
hash_tokens: bool = False,
362362
workers: Optional[int] = None,
363363
post_processing_function: Optional[Callable] = None,
364-
**extra_options, # this is meant to make the constructor accept invalid keywords
364+
**_, # this is meant to make the constructor accept invalid keywords
365365
):
366366
cls.language = language
367367
cls.is_philo_db = is_philo_db
@@ -502,7 +502,7 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
502502
open_file = open
503503
with open_file(text) as philo_db_text:
504504
for line in philo_db_text:
505-
word_obj: Dict[str, Any] = rapidjson.loads(line.strip())
505+
word_obj: Dict[str, Any] = orjson.loads(line.strip())
506506
object_id = " ".join(word_obj["position"].split()[: PHILO_TEXT_OBJECT_TYPE[cls.text_object_type]])
507507
if current_object_id == "":
508508
current_object_id = object_id

text_preprocessing/spacy_helpers.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,32 @@
1313
from modernizer import Modernizer
1414

1515

16-
# Updated as of 3/22/2021
16+
# Updated as of 8/23/2022
1717
SPACY_LANGUAGE_MODEL_MAP: Dict[str, List[str]] = {
18+
"catalan": ["ca_core_news_sm", "ca_core_news_md", "ca_core_news_lg", "ca_core_news_trf"],
19+
"chinese": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
20+
"croatian": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
1821
"danish": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"],
22+
"dutch": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
23+
"english": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
24+
"finnish": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"],
1925
"german": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
2026
"greek": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
21-
"english": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
22-
"spanish": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
2327
"french": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
2428
"italian": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
2529
"japanese": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
30+
"korean": ["ko_core_news_sm", "ko_core_news_md", "ko_core_news_lg"],
2631
"lithuanian": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"],
27-
"norwegian bokmål": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"],
28-
"dutch": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
32+
"macedonian": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"],
33+
"norwegian": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"],
2934
"polish": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"],
3035
"portuguese": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
3136
"romanian": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"],
3237
"russian": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"],
38+
"spanish": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
39+
"swedish": ["sv_core_news_sm", "sv_core_news_md", "sv_core_news_lg"],
40+
"ukrainian": ["uk_core_news_sm", "uk_core_news_md", "uk_core_news_lg"],
3341
"multi-language": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
34-
"chinese": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
3542
}
3643

3744

@@ -343,13 +350,18 @@ def load_language_model(
343350
nlp.add_pipe("normalizer", config={"language": language, **normalizer_config})
344351
if ngram_config["ngram_window"] != 0:
345352
nlp.add_pipe("ngram_generator", config=ngram_config)
346-
print(ngram_config, nlp.pipe_names)
347353
return nlp
348354

349355

350356
if __name__ == "__main__":
351357
nlp = load_language_model(
352358
"french",
359+
{
360+
"language": "french",
361+
"modernize": Modernizer("french"),
362+
"strip_tags": False,
363+
"token_regex": re.compile(rf"(\w+)|([^\w+])"),
364+
},
353365
{
354366
"convert_entities": True,
355367
"lowercase": True,
@@ -361,7 +373,9 @@ def load_language_model(
361373
"min_word_length": 1,
362374
"stopwords": None,
363375
},
364-
filter_config={"pos_to_keep": ["NOUN", "ADJ"], "ents_to_keep": ["PER", "LOC"]},
376+
{"pos_to_keep": ["NOUN", "ADJ"], "ents_to_keep": ["PER", "LOC"]},
377+
{"ngram_window": 0, "ngram_word_order": True},
378+
False,
365379
)
366380
s = """Comme pour « l’incident » survenu sur l’aérodrome de Saky, Kiev n’a pas revendiqué d’attaque sur Djankoï, un conseiller présidentiel, Mykhaïlo Podoliak, se contentant de confirmer l’explosion. Un responsable ukrainien a cependant affirmé au New York Times, sous couvert d’anonymat, qu’une unité militaire d’élite ukrainienne opérant derrière les lignes ennemies était à l’origine de l’attaque. Les responsables ukrainiens ont aussi prévenu mardi que la Crimée ne serait pas épargnée par les ravages de la guerre."""
367381
doc = nlp(s)

0 commit comments

Comments
 (0)