99import pickle
1010import requests
1111import functools
12+ from io import BytesIO
1213from pathlib import Path
13- from typing import Iterable , Tuple , List
14+ from typing import Iterable , Tuple , List , Optional
1415
1516import numpy as np
1617import pandas as pd
3637 "US_ALL" : "^GSPC" ,
3738 "IN_ALL" : "^NSEI" ,
3839 "BR_ALL" : "^BVSP" ,
40+ "JP_ALL" : "^N225" ,
3941}
4042
# Spreadsheet of JPX listed companies; downloaded by get_jp_stock_symbols()
# to build the JP symbol list.
43+ JPX_LISTED_COMPANIES_URL = "https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls"
44+
# Module-level caches, all initialised empty. _JP_SYMBOLS is filled on first
# use by get_jp_stock_symbols(); the other *_SYMBOLS / *_CALENDAR_LIST names
# presumably follow the same lazy pattern in code not shown here - TODO confirm.
4145_BENCH_CALENDAR_LIST = None
4246_ALL_CALENDAR_LIST = None
4347_HS_SYMBOLS = None
4448_US_SYMBOLS = None
4549_IN_SYMBOLS = None
4650_BR_SYMBOLS = None
51+ _JP_SYMBOLS = None
4752_EN_FUND_SYMBOLS = None
# Calendar cache that get_calendar_list() looks up by bench_code.
4853_CALENDAR_MAP = {}
4954
# NOTE(review): MINIMUM_SYMBOLS_NUM is presumably the smallest symbol count a
# fetch must return to be accepted, measured at the timestamp in the note
# below - its usage is not visible in this part of the file; confirm.
5055# NOTE: Until 2020-10-20 20:00:00
5156MINIMUM_SYMBOLS_NUM = 3900
5257
5358
def _normalize_calendar_timestamp(value) -> pd.Timestamp:
    """Convert ``value`` to a timezone-naive timestamp at midnight.

    Parameters
    ----------
    value:
        anything accepted by ``pd.Timestamp``

    Returns
    -------
        pd.Timestamp without tzinfo, with the time-of-day set to 00:00:00
    """
    stamp = pd.Timestamp(value)
    # tz_localize(None) drops the timezone but keeps the local wall-clock time,
    # so the calendar date seen in the source timezone is preserved.
    naive = stamp if stamp.tzinfo is None else stamp.tz_localize(None)
    return naive.normalize()
64+
65+
5466def get_calendar_list (bench_code = "CSI300" ) -> List [pd .Timestamp ]:
5567 """get SH/SZ history calendar list
5668
5769 Parameters
5870 ----------
5971 bench_code: str
60- value from ["CSI300", "CSI500", "ALL", "US_ALL"]
72+ value from ["CSI300", "CSI500", "ALL", "US_ALL", "IN_ALL", "BR_ALL", "JP_ALL" ]
6173
6274 Returns
6375 -------
@@ -72,11 +84,15 @@ def _get_calendar(url):
7284
7385 calendar = _CALENDAR_MAP .get (bench_code , None )
7486 if calendar is None :
75- if bench_code .startswith ("US_" ) or bench_code .startswith ("IN_" ) or bench_code .startswith ("BR_" ):
76- print (Ticker (CALENDAR_BENCH_URL_MAP [bench_code ]))
77- print (Ticker (CALENDAR_BENCH_URL_MAP [bench_code ]).history (interval = "1d" , period = "max" ))
78- df = Ticker (CALENDAR_BENCH_URL_MAP [bench_code ]).history (interval = "1d" , period = "max" )
79- calendar = df .index .get_level_values (level = "date" ).map (pd .Timestamp ).unique ().tolist ()
87+ if (
88+ bench_code .startswith ("US_" )
89+ or bench_code .startswith ("IN_" )
90+ or bench_code .startswith ("BR_" )
91+ or bench_code .startswith ("JP_" )
92+ ):
93+ _ticker = Ticker (CALENDAR_BENCH_URL_MAP [bench_code ])
94+ df = _ticker .history (interval = "1d" , period = "max" )
95+ calendar = sorted ({_normalize_calendar_timestamp (v ) for v in df .index .get_level_values (level = "date" )})
8096 else :
8197 if bench_code .upper () == "ALL" :
8298 import akshare as ak # pylint: disable=C0415
@@ -448,6 +464,106 @@ def _format(s_):
448464 return _BR_SYMBOLS
449465
450466
def _normalize_jpx_column_name(col_name: str) -> str:
    """Return a whitespace-free, lower-cased form of a JPX column header.

    Parameters
    ----------
    col_name:
        column label; converted with ``str()`` first, so non-string labels are accepted

    Returns
    -------
        the label with every whitespace character removed and lower-cased
    """
    # str.split() without a separator splits on any run of Unicode whitespace,
    # which covers ASCII spaces, full-width spaces (U+3000), tabs and newlines
    # wherever they occur in the header.
    return "".join(str(col_name).split()).lower()
469+
470+
def _find_jpx_column(columns: list, exact_candidates: list, keyword_candidates: list) -> Optional[str]:
    """Locate a column by exact name first, then by keywords.

    All comparisons are done on names normalized by ``_normalize_jpx_column_name``.

    Parameters
    ----------
    columns:
        column labels to search, in priority order
    exact_candidates:
        names that identify the column when equal to a normalized label
    keyword_candidates:
        fragments that must ALL occur in a normalized label for a fuzzy match

    Returns
    -------
        the original (un-normalized) label of the first match, or None
    """
    normalized_columns = [(col, _normalize_jpx_column_name(col)) for col in columns]
    exact_names = {_normalize_jpx_column_name(name) for name in exact_candidates}
    # Drop keywords that normalize to "": an empty fragment is contained in
    # every label and would turn the fuzzy pass into "return the first column".
    keywords = [kw for kw in map(_normalize_jpx_column_name, keyword_candidates) if kw]

    # Pass 1: an exact match anywhere wins over any fuzzy match.
    for col, normalized in normalized_columns:
        if normalized in exact_names:
            return col

    # Pass 2: fuzzy match, only meaningful when at least one keyword is left
    # (all() over an empty list is vacuously True).
    if keywords:
        for col, normalized in normalized_columns:
            if all(kw in normalized for kw in keywords):
                return col

    return None
485+
486+
def _extract_jp_prime_symbols(df: pd.DataFrame) -> list:
    """Extract ticker symbols of domestic Prime-market stocks from the JPX listing table.

    Parameters
    ----------
    df:
        table of JPX listed companies; must contain a stock code column and a
        market classification column (located via ``_find_jpx_column``)

    Returns
    -------
        sorted list of unique symbols in the form ``"<4-digit code>.T"``

    Raises
    ------
    ValueError
        if the table is empty, a required column cannot be found, no row is
        classified as a domestic Prime stock, or no valid code can be extracted
    """
    if df is None or df.empty:
        raise ValueError("JPX listed companies file is empty")

    columns = df.columns.tolist()

    code_col = _find_jpx_column(
        columns=columns,
        exact_candidates=["コード", "銘柄コード", "code", "securitycode"],
        keyword_candidates=["コード"],
    )
    if code_col is None:
        raise ValueError("Unable to find stock code column in JPX listed companies file")

    market_col = _find_jpx_column(
        columns=columns,
        exact_candidates=["市場・商品区分", "市場商品区分", "市場区分", "marketsegment"],
        keyword_candidates=["市場", "区分"],
    )
    if market_col is None:
        raise ValueError("Unable to find market classification column in JPX listed companies file")

    # Optional: only consulted when the market column itself carries no
    # domestic/foreign marker.
    domestic_col = _find_jpx_column(
        columns=columns,
        exact_candidates=["内外株式区分", "内外区分", "domesticforeign"],
        keyword_candidates=["内外", "区分"],
    )

    market_series = df[market_col].astype(str)
    prime_mask = market_series.str.contains("プライム", na=False)

    # Prefer the domestic marker embedded in the market column; fall back to the
    # dedicated column only when the market column never mentions it. Without
    # either source the mask stays all-False and the emptiness check below raises.
    domestic_mask = market_series.str.contains("内国株式", na=False)
    if not domestic_mask.any() and domestic_col is not None:
        domestic_mask = df[domestic_col].astype(str).str.contains("内国株式", na=False)

    target_codes = df.loc[prime_mask & domestic_mask, code_col]
    if target_codes.empty:
        raise ValueError("No JPX Prime domestic stocks found in listed companies file")

    # NOTE(review): only the first run of four digits is kept, so codes that are
    # not purely numeric (e.g. an alphanumeric code such as "130A") are dropped -
    # confirm this is intended.
    codes = target_codes.astype(str).str.extract(r"(\d{4})", expand=False).dropna()
    # The suffix is appended without any padding: "7203" -> "7203.T".
    symbols = codes.map(lambda code: f"{code}.T").drop_duplicates().sort_values().tolist()
    if not symbols:
        raise ValueError("No valid JP stock symbols extracted from JPX listed companies file")
    return symbols
540+
541+
def get_jp_stock_symbols() -> list:
    """get JP Prime (domestic stock) symbols

    The JPX listed-companies file is downloaded and parsed on the first call;
    the resulting list is cached in the module-level ``_JP_SYMBOLS`` and
    returned directly on later calls.

    Returns
    -------
        list of symbols as produced by ``_extract_jp_prime_symbols``

    Raises
    ------
    ValueError
        on a non-200 HTTP status, when the payload can be parsed neither as
        Excel nor as HTML, or when no symbols can be extracted
        (NOTE(review): how ``deco_retry`` surfaces the final failure is not
        visible here - confirm)
    """

    global _JP_SYMBOLS  # pylint: disable=W0603

    # Cache hit: skip building the decorated downloader altogether.
    if _JP_SYMBOLS is not None:
        return _JP_SYMBOLS

    @deco_retry
    def _get_jpx_listed_companies_df():
        # Finite (connect, read) timeouts: a stalled connection now raises, so
        # the surrounding retry decorator can act instead of blocking forever.
        resp = requests.get(JPX_LISTED_COMPANIES_URL, timeout=(10, 60))
        if resp.status_code != 200:
            raise ValueError(f"request error, status_code={resp.status_code}")
        try:
            return pd.read_excel(BytesIO(resp.content), dtype=str)
        except Exception as excel_error:
            # Best-effort fallback: try to read the payload as an HTML table
            # when it cannot be parsed as a spreadsheet.
            try:
                return pd.read_html(BytesIO(resp.content))[0].astype(str)
            except Exception as html_error:
                raise ValueError(
                    f"failed to parse JPX listed companies file: excel_error={excel_error}, html_error={html_error}"
                ) from html_error

    _JP_SYMBOLS = _extract_jp_prime_symbols(_get_jpx_listed_companies_df())
    return _JP_SYMBOLS
565+
566+
451567def get_en_fund_symbols (qlib_data_path : [str , Path ] = None ) -> list :
452568 """get en fund symbols
453569
0 commit comments