Skip to content

Commit 8ed6f96

Browse files
committed
Add JP Prime Yahoo collector support
1 parent 2fb9380 commit 8ed6f96

File tree

5 files changed

+370
-14
lines changed

5 files changed

+370
-14
lines changed

scripts/data_collector/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
Scripts for data collection
66

7-
- yahoo: get *US/CN* stock data from *Yahoo Finance*
7+
- yahoo: get *US/CN/IN/BR/JP* stock data from *Yahoo Finance*
88
- fund: get fund data from *http://fund.eastmoney.com*
99
- cn_index: get *CN index* from *http://www.csindex.com.cn*, *CSI300*/*CSI100*
1010
- us_index: get *US index* from *https://en.wikipedia.org/wiki*, *SP500*/*NASDAQ100*/*DJIA*/*SP400*
@@ -57,4 +57,4 @@ Scripts for data collection
5757
| Component | required data |
5858
|---------------------------------------------------|--------------------------------|
5959
| Data retrieval | Features, Calendar, Instrument |
60-
| Backtest | **Features[Price/Volume]**, Calendar, Instruments |
60+
| Backtest | **Features[Price/Volume]**, Calendar, Instruments |

scripts/data_collector/utils.py

Lines changed: 123 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
import pickle
1010
import requests
1111
import functools
12+
from io import BytesIO
1213
from pathlib import Path
13-
from typing import Iterable, Tuple, List
14+
from typing import Iterable, Tuple, List, Optional
1415

1516
import numpy as np
1617
import pandas as pd
@@ -36,28 +37,39 @@
3637
"US_ALL": "^GSPC",
3738
"IN_ALL": "^NSEI",
3839
"BR_ALL": "^BVSP",
40+
"JP_ALL": "^N225",
3941
}
4042

43+
JPX_LISTED_COMPANIES_URL = "https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls"
44+
4145
_BENCH_CALENDAR_LIST = None
4246
_ALL_CALENDAR_LIST = None
4347
_HS_SYMBOLS = None
4448
_US_SYMBOLS = None
4549
_IN_SYMBOLS = None
4650
_BR_SYMBOLS = None
51+
_JP_SYMBOLS = None
4752
_EN_FUND_SYMBOLS = None
4853
_CALENDAR_MAP = {}
4954

5055
# NOTE: Until 2020-10-20 20:00:00
5156
MINIMUM_SYMBOLS_NUM = 3900
5257

5358

59+
def _normalize_calendar_timestamp(value) -> pd.Timestamp:
60+
ts = pd.Timestamp(value)
61+
if ts.tzinfo is not None:
62+
ts = ts.tz_localize(None)
63+
return ts.normalize()
64+
65+
5466
def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
5567
"""get SH/SZ history calendar list
5668
5769
Parameters
5870
----------
5971
bench_code: str
60-
value from ["CSI300", "CSI500", "ALL", "US_ALL"]
72+
value from ["CSI300", "CSI500", "ALL", "US_ALL", "IN_ALL", "BR_ALL", "JP_ALL"]
6173
6274
Returns
6375
-------
@@ -72,11 +84,15 @@ def _get_calendar(url):
7284

7385
calendar = _CALENDAR_MAP.get(bench_code, None)
7486
if calendar is None:
75-
if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"):
76-
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]))
77-
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max"))
78-
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
79-
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
87+
if (
88+
bench_code.startswith("US_")
89+
or bench_code.startswith("IN_")
90+
or bench_code.startswith("BR_")
91+
or bench_code.startswith("JP_")
92+
):
93+
_ticker = Ticker(CALENDAR_BENCH_URL_MAP[bench_code])
94+
df = _ticker.history(interval="1d", period="max")
95+
calendar = sorted({_normalize_calendar_timestamp(v) for v in df.index.get_level_values(level="date")})
8096
else:
8197
if bench_code.upper() == "ALL":
8298
import akshare as ak # pylint: disable=C0415
@@ -448,6 +464,106 @@ def _format(s_):
448464
return _BR_SYMBOLS
449465

450466

467+
def _normalize_jpx_column_name(col_name: str) -> str:
468+
return str(col_name).replace(" ", "").replace("\u3000", "").replace("\n", "").strip().lower()
469+
470+
471+
def _find_jpx_column(columns: list, exact_candidates: list, keyword_candidates: list) -> Optional[str]:
472+
normalized_map = {col: _normalize_jpx_column_name(col) for col in columns}
473+
exact_candidates = {_normalize_jpx_column_name(col) for col in exact_candidates}
474+
keyword_candidates = [_normalize_jpx_column_name(col) for col in keyword_candidates]
475+
476+
for _col, _normalized_col in normalized_map.items():
477+
if _normalized_col in exact_candidates:
478+
return _col
479+
480+
for _col, _normalized_col in normalized_map.items():
481+
if all(_keyword in _normalized_col for _keyword in keyword_candidates):
482+
return _col
483+
484+
return None
485+
486+
487+
def _extract_jp_prime_symbols(df: pd.DataFrame) -> list:
488+
if df is None or df.empty:
489+
raise ValueError("JPX listed companies file is empty")
490+
491+
code_col = _find_jpx_column(
492+
columns=df.columns.tolist(),
493+
exact_candidates=["コード", "銘柄コード", "code", "securitycode"],
494+
keyword_candidates=["コード"],
495+
)
496+
if code_col is None:
497+
raise ValueError("Unable to find stock code column in JPX listed companies file")
498+
499+
market_col = _find_jpx_column(
500+
columns=df.columns.tolist(),
501+
exact_candidates=["市場・商品区分", "市場商品区分", "市場区分", "marketsegment"],
502+
keyword_candidates=["市場", "区分"],
503+
)
504+
if market_col is None:
505+
raise ValueError("Unable to find market classification column in JPX listed companies file")
506+
507+
domestic_col = _find_jpx_column(
508+
columns=df.columns.tolist(),
509+
exact_candidates=["内外株式区分", "内外区分", "domesticforeign"],
510+
keyword_candidates=["内外", "区分"],
511+
)
512+
513+
market_series = df[market_col].astype(str)
514+
prime_mask = market_series.str.contains("プライム", na=False)
515+
516+
if market_series.str.contains("内国株式", na=False).any():
517+
domestic_mask = market_series.str.contains("内国株式", na=False)
518+
elif domestic_col is not None:
519+
domestic_mask = df[domestic_col].astype(str).str.contains("内国株式", na=False)
520+
else:
521+
domestic_mask = market_series.str.contains("内国株式", na=False)
522+
523+
target_df = df.loc[prime_mask & domestic_mask, [code_col]].copy()
524+
if target_df.empty:
525+
raise ValueError("No JPX Prime domestic stocks found in listed companies file")
526+
527+
symbols = (
528+
target_df[code_col]
529+
.astype(str)
530+
.str.extract(r"(\d{4})", expand=False)
531+
.dropna()
532+
.apply(lambda code: f"{code}.T")
533+
.drop_duplicates()
534+
.sort_values()
535+
.tolist()
536+
)
537+
if not symbols:
538+
raise ValueError("No valid JP stock symbols extracted from JPX listed companies file")
539+
return symbols
540+
541+
542+
def get_jp_stock_symbols() -> list:
    """get JP Prime (domestic stock) symbols

    Returns
    -------
        Yahoo-format JP symbols (e.g. ["1301.T", ...]); cached in the
        module-level ``_JP_SYMBOLS`` after the first successful download.
    """
    global _JP_SYMBOLS  # pylint: disable=W0603

    @deco_retry
    def _get_jpx_listed_companies_df():
        # Fix: the original passed timeout=None (i.e. *no* timeout), which can
        # hang the collector forever if JPX is unreachable; use a finite timeout
        # and let deco_retry handle transient failures.
        resp = requests.get(JPX_LISTED_COMPANIES_URL, timeout=60)
        if resp.status_code != 200:
            raise ValueError(f"request error, status_code={resp.status_code}")
        try:
            return pd.read_excel(BytesIO(resp.content), dtype=str)
        except Exception as excel_error:
            # The file is nominally .xls but may parse only as an HTML table;
            # try the HTML parser before giving up.
            try:
                return pd.read_html(BytesIO(resp.content))[0].astype(str)
            except Exception as html_error:
                raise ValueError(
                    f"failed to parse JPX listed companies file: excel_error={excel_error}, html_error={html_error}"
                ) from html_error

    if _JP_SYMBOLS is None:
        _JP_SYMBOLS = _extract_jp_prime_symbols(_get_jpx_listed_companies_df())
    return _JP_SYMBOLS
565+
566+
451567
def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list:
452568
"""get en fund symbols
453569

scripts/data_collector/yahoo/README.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ pip install -r requirements.txt
5555
### Collector *YahooFinance* data to qlib
5656
> Collect *YahooFinance* data and *dump* it into `qlib` format.
5757
> If the above ready-made data can't meet users' requirements, users can follow this section to crawl the latest data and convert it to qlib-data.
58+
> For `region=JP`, the symbol universe is **TSE Prime (domestic stocks)** from JPX listed companies file.
5859
1. download data to csv: `python scripts/data_collector/yahoo/collector.py download_data`
5960

6061
This will download the raw data such as high, low, open, close, adjclose price from yahoo to a local directory. One file per symbol.
@@ -63,7 +64,8 @@ pip install -r requirements.txt
6364
- `source_dir`: save the directory
6465
- `interval`: `1d` or `1min`, by default `1d`
6566
> **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`**
66-
- `region`: `CN` or `US` or `IN` or `BR`, by default `CN`
67+
- `region`: `CN` or `US` or `IN` or `BR` or `JP`, by default `CN`
68+
> `JP` supports `1d` only
6769
- `delay`: `time.sleep(delay)`, by default *0.5*
6870
- `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)*
6971
- `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)*
@@ -92,6 +94,9 @@ pip install -r requirements.txt
9294
python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR
9395
# br 1min data
9496
python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR
97+
98+
# jp 1d data (TSE Prime domestic stocks)
99+
python collector.py download_data --source_dir ~/.qlib/stock_data/source/jp_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region JP
95100
```
96101
2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data`
97102
@@ -105,7 +110,8 @@ pip install -r requirements.txt
105110
- `max_workers`: number of concurrent, by default *1*
106111
- `interval`: `1d` or `1min`, by default `1d`
107112
> if **`interval == 1min`**, `qlib_data_1d_dir` cannot be `None`
108-
- `region`: `CN` or `US` or `IN`, by default `CN`
113+
- `region`: `CN` or `US` or `IN` or `BR` or `JP`, by default `CN`
114+
> `JP` supports `1d` only
109115
- `date_field_name`: column *name* identifying time in csv files, by default `date`
110116
- `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
111117
- `end_date`: if not `None`, normalize the last date saved (*including end_date*); if `None`, it will ignore this parameter; by default `None`
@@ -133,6 +139,9 @@ pip install -r requirements.txt
133139
134140
# normalize 1min br
135141
python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min
142+
143+
# normalize 1d jp
144+
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/jp_data --normalize_dir ~/.qlib/stock_data/source/jp_1d_nor --region JP --interval 1d
136145
```
137146
3. dump data: `python scripts/dump_bin.py dump_all`
138147
@@ -222,4 +231,3 @@ pip install -r requirements.txt
222231
# get all symbol data
223232
# df = D.features(D.instruments("all"), ["$close"], freq="1min")
224233
```
225-

scripts/data_collector/yahoo/collector.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
get_us_stock_symbols,
3939
get_in_stock_symbols,
4040
get_br_stock_symbols,
41+
get_jp_stock_symbols,
4142
generate_minutes_calendar_from_daily,
4243
calc_adjusted_price,
4344
)
@@ -364,6 +365,33 @@ class YahooCollectorBR1min(YahooCollectorBR):
364365
retry = 2
365366

366367

368+
class YahooCollectorJP(YahooCollector, ABC):
    """Yahoo Finance collector for the JP region (TSE Prime domestic stocks)."""

    def get_instrument_list(self):
        """Return the JP symbol universe taken from the JPX listed-companies file."""
        logger.info("get JP Prime (domestic stock) symbols......")
        instruments = get_jp_stock_symbols()
        logger.info(f"get {len(instruments)} symbols.")
        return instruments

    def download_index_data(self):
        """No benchmark index data is downloaded for JP."""
        # Intentionally a no-op: the base-class hook is not needed here.

    def normalize_symbol(self, symbol):
        """Map a Yahoo symbol to an upper-cased file name."""
        return code_to_fname(symbol).upper()

    @property
    def _timezone(self):
        # Tokyo Stock Exchange local time.
        return "Asia/Tokyo"


class YahooCollectorJP1d(YahooCollectorJP):
    """Daily-interval JP collector; inherits all behavior from YahooCollectorJP."""


class YahooCollectorJP1min(YahooCollectorJP):
    """1min collection is unsupported for JP; instantiation always fails."""

    def __init__(self, *args, **kwargs):
        raise ValueError("JP region does not support 1min data collection")
394+
367395
class YahooNormalize(BaseNormalize):
368396
COLUMNS = ["open", "close", "high", "low", "volume"]
369397
DAILY_FORMAT = "%Y-%m-%d"
@@ -720,6 +748,27 @@ def symbol_to_yahoo(self, symbol):
720748
return fname_to_code(symbol)
721749

722750

751+
class YahooNormalizeJP:
    """Mixin supplying the JP trading calendar to the Yahoo normalizers."""

    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
        # The JP calendar is derived from the "JP_ALL" benchmark (^N225).
        return get_calendar_list("JP_ALL")


class YahooNormalizeJP1d(YahooNormalizeJP, YahooNormalize1d):
    """Daily JP normalizer: JP calendar combined with the 1d pipeline."""


class YahooNormalizeJP1dExtend(YahooNormalizeJP, YahooNormalize1dExtend):
    """Daily JP normalizer used when extending an existing dataset."""


class YahooNormalizeJP1min(YahooNormalizeJP, YahooNormalize1min):
    """1min normalization is unsupported for JP; instantiation always fails."""

    def __init__(self, *args, **kwargs):
        raise ValueError("JP region does not support 1min normalization")

    def symbol_to_yahoo(self, symbol):
        # NOTE(review): effectively unreachable since __init__ raises; if JP 1d
        # needs this mapping too, it likely belongs on YahooNormalizeJP — confirm.
        return fname_to_code(symbol)
771+
723772
class Run(BaseRun):
724773
def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
725774
"""
@@ -735,11 +784,15 @@ def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval=
735784
interval: str
736785
freq, value from [1min, 1d], default 1d
737786
region: str
738-
region, value from ["CN", "US", "BR"], default "CN"
787+
region, value from ["CN", "US", "IN", "BR", "JP"], default "CN"
739788
"""
740789
super().__init__(source_dir, normalize_dir, max_workers, interval)
741790
self.region = region
742791

792+
def _validate_region_interval(self):
793+
if self.region.upper() == "JP" and self.interval.lower() == "1min":
794+
raise ValueError("JP region does not support 1min data")
795+
743796
@property
744797
def collector_class_name(self):
745798
return f"YahooCollector{self.region.upper()}{self.interval}"
@@ -792,6 +845,7 @@ def download_data(
792845
# get 1m data
793846
$ python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1m
794847
"""
848+
self._validate_region_interval()
795849
if self.interval == "1d" and pd.Timestamp(end) > pd.Timestamp(datetime.datetime.now().strftime("%Y-%m-%d")):
796850
raise ValueError(f"end_date: {end} is greater than the current date.")
797851

@@ -828,6 +882,7 @@ def normalize_data(
828882
$ python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region cn --interval 1d
829883
$ python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data --source_dir ~/.qlib/stock_data/source_cn_1min --normalize_dir ~/.qlib/stock_data/normalize_cn_1min --region CN --interval 1min
830884
"""
885+
self._validate_region_interval()
831886
if self.interval.lower() == "1min":
832887
if qlib_data_1d_dir is None or not Path(qlib_data_1d_dir).expanduser().exists():
833888
raise ValueError(
@@ -937,6 +992,7 @@ def update_data_to_bin(
937992
check_data_length: int = None,
938993
delay: float = 1,
939994
exists_skip: bool = False,
995+
limit_nums: int = None,
940996
):
941997
"""update yahoo data to bin
942998
@@ -953,6 +1009,8 @@ def update_data_to_bin(
9531009
time.sleep(delay), default 1
9541010
exists_skip: bool
9551011
exists skip, by default False
1012+
limit_nums: int
1013+
using for debug, by default None
9561014
Notes
9571015
-----
9581016
If the data in qlib_data_dir is incomplete, np.nan will be populated to trading_date for the previous trading day
@@ -981,7 +1039,13 @@ def update_data_to_bin(
9811039

9821040
# download data from yahoo
9831041
# NOTE: when downloading data from YahooFinance, max_workers is recommended to be 1
984-
self.download_data(delay=delay, start=trading_date, end=end_date, check_data_length=check_data_length)
1042+
self.download_data(
1043+
delay=delay,
1044+
start=trading_date,
1045+
end=end_date,
1046+
check_data_length=check_data_length,
1047+
limit_nums=limit_nums,
1048+
)
9851049
# NOTE: a larger max_workers setting here would be faster
9861050
self.max_workers = (
9871051
max(multiprocessing.cpu_count() - 2, 1)

0 commit comments

Comments
 (0)