1313from modernizer import Modernizer
1414
1515
# Updated as of 8/23/2022
# Maps a lowercase language name to the spaCy pipeline package names listed for it.
# Most entries are ordered "sm", "md", "lg", followed by a transformer-based package
# where one is listed; "multi-language" is the exception and lists two small packages.
# Keys and package names must not carry stray whitespace: they are used for exact lookups.
SPACY_LANGUAGE_MODEL_MAP: Dict[str, List[str]] = {
    "catalan": ["ca_core_news_sm", "ca_core_news_md", "ca_core_news_lg", "ca_core_news_trf"],
    "chinese": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
    "croatian": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
    # Misspelled alias of "croatian", kept so existing lookups by the old key keep working.
    "croation": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
    "danish": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"],
    "dutch": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
    "english": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
    "finnish": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"],
    "german": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
    "greek": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
    "french": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
    "italian": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
    "japanese": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
    "korean": ["ko_core_news_sm", "ko_core_news_md", "ko_core_news_lg"],
    "lithuanian": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"],
    "macedonian": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"],
    "norwegian": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"],
    "polish": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"],
    "portuguese": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
    "romanian": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"],
    "russian": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"],
    "spanish": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
    "swedish": ["sv_core_news_sm", "sv_core_news_md", "sv_core_news_lg"],
    "ukrainian": ["uk_core_news_sm", "uk_core_news_md", "uk_core_news_lg"],
    "multi-language": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
}
3643
3744
@@ -343,13 +350,18 @@ def load_language_model(
343350 nlp .add_pipe ("normalizer" , config = {"language" : language , ** normalizer_config })
344351 if ngram_config ["ngram_window" ] != 0 :
345352 nlp .add_pipe ("ngram_generator" , config = ngram_config )
346- print (ngram_config , nlp .pipe_names )
347353 return nlp
348354
349355
350356if __name__ == "__main__" :
351357 nlp = load_language_model (
352358 "french" ,
359+ {
360+ "language" : "french" ,
361+ "modernize" : Modernizer ("french" ),
362+ "strip_tags" : False ,
363+ "token_regex" : re .compile (rf"(\w+)|([^\w+])" ),
364+ },
353365 {
354366 "convert_entities" : True ,
355367 "lowercase" : True ,
@@ -361,7 +373,9 @@ def load_language_model(
361373 "min_word_length" : 1 ,
362374 "stopwords" : None ,
363375 },
364- filter_config = {"pos_to_keep" : ["NOUN" , "ADJ" ], "ents_to_keep" : ["PER" , "LOC" ]},
376+ {"pos_to_keep" : ["NOUN" , "ADJ" ], "ents_to_keep" : ["PER" , "LOC" ]},
377+ {"ngram_window" : 0 , "ngram_word_order" : True },
378+ False ,
365379 )
366380 s = """Comme pour « l’incident » survenu sur l’aérodrome de Saky, Kiev n’a pas revendiqué d’attaque sur Djankoï, un conseiller présidentiel, Mykhaïlo Podoliak, se contentant de confirmer l’explosion. Un responsable ukrainien a cependant affirmé au New York Times, sous couvert d’anonymat, qu’une unité militaire d’élite ukrainienne opérant derrière les lignes ennemies était à l’origine de l’attaque. Les responsables ukrainiens ont aussi prévenu mardi que la Crimée ne serait pas épargnée par les ravages de la guerre."""
367381 doc = nlp (s )
0 commit comments