diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index 3e072cfa0a..bd9db48b45 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -862,7 +862,7 @@ Use `hf spaces` to list Spaces on the Hub and get detailed information about a s ## hf papers -Use `hf papers` to list daily papers on the Hub. +Use `hf papers` to list, search, get structured info, and read the markdown content of papers on the Hub. ### List papers @@ -879,10 +879,46 @@ Use `hf papers` to list daily papers on the Hub. # List today's papers >>> hf papers ls --date=today +# List papers from a specific week +>>> hf papers ls --week=2025-W09 + +# List papers from a specific month +>>> hf papers ls --month=2025-02 + +# List papers submitted by a specific user +>>> hf papers ls --submitter=akhaliq + # Limit results >>> hf papers ls --sort=trending --limit=5 ``` +### Search papers + +```bash +# Search papers by keyword +>>> hf papers search "vision language" + +# Limit search results +>>> hf papers search "diffusion models" --limit=10 + +# Output as JSON +>>> hf papers search "attention" --format=json +``` + +### Get paper info + +```bash +# Get structured metadata for a paper (returns JSON) +>>> hf papers info 2601.15621 +``` + +### Read paper as markdown + +```bash +# Read the full paper content as markdown +>>> hf papers read 2601.15621 +``` + ## hf discussions Use `hf discussions` to manage discussions and pull requests on Hub repositories directly from your terminal. You can list, view, create, comment on, close, reopen, and merge both discussions and PRs. For a full guide on how the Hub's community features work, see the [Discussions and Pull Requests guide](./community). diff --git a/docs/source/en/package_reference/cli.md b/docs/source/en/package_reference/cli.md index 6ba53b1db4..4a6fd1fc58 100644 --- a/docs/source/en/package_reference/cli.md +++ b/docs/source/en/package_reference/cli.md @@ -2665,7 +2665,37 @@ $ hf papers [OPTIONS] COMMAND [ARGS]... **Commands**: +* `info`: Get info about a paper on the Hub. * `list`: List daily papers on the Hub. [alias: ls] +* `read`: Read a paper as markdown. +* `search`: Search papers on the Hub. + +### `hf papers info` + +Get info about a paper on the Hub. Output is in JSON format. + +**Usage**: + +```console +$ hf papers info [OPTIONS] PAPER_ID +``` + +**Arguments**: + +* `PAPER_ID`: The arXiv paper ID (e.g. '2502.08025'). [required] + +**Options**: + +* `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens. +* `--help`: Show this message and exit. + +Examples + $ hf papers info 2601.15621 + +Learn more + Use `hf --help` for more information about a command. + Read the documentation at https://huggingface.co/docs/huggingface_hub/en/guides/cli + ### `hf papers list` @@ -2680,6 +2710,9 @@ $ hf papers list [OPTIONS] **Options**: * `--date TEXT`: Date in ISO format (YYYY-MM-DD) or 'today'. +* `--week TEXT`: ISO week to filter by, e.g. '2025-W09'. +* `--month TEXT`: Month to filter by in ISO format (YYYY-MM), e.g. '2025-02'. +* `--submitter TEXT`: Filter by username of the submitter. * `--sort [publishedAt|trending]`: Sort results. * `--limit INTEGER`: Limit the number of results. [default: 50] * `--format [table|json]`: Output format (table or json). [default: table] @@ -2691,6 +2724,8 @@ Examples $ hf papers ls $ hf papers ls --sort trending $ hf papers ls --date 2025-01-23 + $ hf papers ls --week 2025-W09 + $ hf papers ls --submitter akhaliq $ hf papers ls --format json Learn more @@ -2698,6 +2733,65 @@ Learn more Read the documentation at https://huggingface.co/docs/huggingface_hub/en/guides/cli +### `hf papers read` + +Read a paper as markdown. + +**Usage**: + +```console +$ hf papers read [OPTIONS] PAPER_ID +``` + +**Arguments**: + +* `PAPER_ID`: The arXiv paper ID (e.g. '2502.08025'). [required] + +**Options**: + +* `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens. +* `--help`: Show this message and exit. + +Examples + $ hf papers read 2601.15621 + +Learn more + Use `hf --help` for more information about a command. + Read the documentation at https://huggingface.co/docs/huggingface_hub/en/guides/cli + + +### `hf papers search` + +Search papers on the Hub. + +**Usage**: + +```console +$ hf papers search [OPTIONS] QUERY +``` + +**Arguments**: + +* `QUERY`: Search query string. [required] + +**Options**: + +* `--limit INTEGER`: Limit the number of results. [default: 20] +* `--format [table|json]`: Output format (table or json). [default: table] +* `-q, --quiet`: Print only IDs (one per line). +* `--token TEXT`: A User Access Token generated from https://huggingface.co/settings/tokens. +* `--help`: Show this message and exit. + +Examples + $ hf papers search "vision language" + $ hf papers search "attention mechanism" --limit 10 + $ hf papers search "diffusion" --format json + +Learn more + Use `hf --help` for more information about a command. + Read the documentation at https://huggingface.co/docs/huggingface_hub/en/guides/cli + + ## `hf repos` Manage repos on the Hub. [alias: repo] diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 986af42249..6e91b31cd0 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -299,6 +299,7 @@ "pause_space", "permanently_delete_lfs_files", "preupload_lfs_files", + "read_paper", "reject_access_request", "rename_discussion", "repo_exists", @@ -1024,6 +1025,7 @@ "preupload_lfs_files", "push_to_hub_fastai", "read_dduf_file", + "read_paper", "reject_access_request", "rename_discussion", "repo_exists", @@ -1411,6 +1413,7 @@ def __dir__(): pause_space, # noqa: F401 permanently_delete_lfs_files, # noqa: F401 preupload_lfs_files, # noqa: F401 + read_paper, # noqa: F401 reject_access_request, # noqa: F401 rename_discussion, # noqa: F401 repo_exists, # noqa: F401 diff --git a/src/huggingface_hub/cli/papers.py b/src/huggingface_hub/cli/papers.py index 871e07b9a2..b635483405 100644 --- a/src/huggingface_hub/cli/papers.py +++ b/src/huggingface_hub/cli/papers.py @@ -25,14 +25,31 @@ # list today's papers, ordered by upvotes hf papers ls --date=today + + # list papers from a specific week + hf papers ls --week=2025-W09 + + # list papers by a specific submitter + hf papers ls --submitter=someuser + + # search papers + hf papers search "vision language" + + # get info about a paper + hf papers info 2502.08025 + + # read a paper as markdown + hf papers read 2502.08025 """ import datetime import enum +import json from typing import Annotated, Optional, get_args import typer +from huggingface_hub.errors import CLIError, HfHubHTTPError from huggingface_hub.hf_api import DailyPapersSort_T from ._cli_utils import ( @@ -71,6 +88,8 @@ def _parse_date(value: Optional[str]) -> Optional[str]: "hf papers ls", "hf papers ls --sort trending", "hf papers ls --date 2025-01-23", + "hf papers ls --week 2025-W09", + "hf papers ls --submitter akhaliq", "hf papers ls --format json", ], ) @@ -82,6 +101,18 @@ def papers_ls( callback=_parse_date, ), ] = None, + week: Annotated[ + Optional[str], + typer.Option(help="ISO week to filter by, e.g. '2025-W09'."), + ] = None, + month: Annotated[ + Optional[str], + typer.Option(help="Month to filter by in ISO format (YYYY-MM), e.g. '2025-02'."), + ] = None, + submitter: Annotated[ + Optional[str], + typer.Option(help="Filter by username of the submitter."), + ] = None, sort: Annotated[ Optional[PaperSortEnum], typer.Option(help="Sort results."), @@ -98,6 +129,9 @@ def papers_ls( api_object_to_dict(paper_info) for paper_info in api.list_daily_papers( date=date, + week=week, + month=month, + submitter=submitter, sort=sort_key, limit=limit, ) @@ -106,14 +140,14 @@ def papers_ls( def _paper_row(item: dict) -> list[str]: submitted_by = item.get("submitted_by") or {} - submitter = submitted_by.get("fullname") or submitted_by.get("username") or "" + submitter_name = submitted_by.get("fullname") or submitted_by.get("username") or "" return [ item.get("id", ""), _format_cell(item.get("title", ""), max_len=60), str(item.get("upvotes", "")), str(item.get("comments", "")), _format_cell(item.get("published_at", "")), - submitter, + submitter_name, ] print_list_output( @@ -125,3 +159,84 @@ def _paper_row(item: dict) -> list[str]: row_fn=_paper_row, alignments={"upvotes": "right", "comments": "right"}, ) + + +@papers_cli.command( + "search", + examples=[ + 'hf papers search "vision language"', + 'hf papers search "attention mechanism" --limit 10', + 'hf papers search "diffusion" --format json', + ], +) +def papers_search( + query: Annotated[str, typer.Argument(help="Search query string.")], + limit: LimitOpt = 20, + format: FormatOpt = OutputFormat.table, + quiet: QuietOpt = False, + token: TokenOpt = None, +) -> None: + """Search papers on the Hub.""" + api = get_hf_api(token=token) + results = [api_object_to_dict(paper_info) for paper_info in api.list_papers(query=query, limit=limit)] + _HEADERS = ["id", "title", "upvotes", "published_at"] + + def _paper_row(item: dict) -> list[str]: + return [ + item.get("id", ""), + _format_cell(item.get("title", ""), max_len=70), + str(item.get("upvotes", "")), + _format_cell(item.get("published_at", "")), + ] + + print_list_output( + results, + format=format, + quiet=quiet, + id_key="id", + headers=_HEADERS, + row_fn=_paper_row, + alignments={"upvotes": "right"}, + ) + + +@papers_cli.command( + "info", + examples=[ + "hf papers info 2601.15621", + ], +) +def papers_info( + paper_id: Annotated[str, typer.Argument(help="The arXiv paper ID (e.g. '2502.08025').")], + token: TokenOpt = None, +) -> None: + """Get info about a paper on the Hub. Output is in JSON format.""" + api = get_hf_api(token=token) + try: + info = api.paper_info(id=paper_id) + except HfHubHTTPError as e: + if e.response.status_code == 404: + raise CLIError(f"Paper '{paper_id}' not found on the Hub.") from e + raise + print(json.dumps(api_object_to_dict(info), indent=2)) + + +@papers_cli.command( + "read", + examples=[ + "hf papers read 2601.15621", + ], +) +def papers_read( + paper_id: Annotated[str, typer.Argument(help="The arXiv paper ID (e.g. '2502.08025').")], + token: TokenOpt = None, +) -> None: + """Read a paper as markdown.""" + api = get_hf_api(token=token) + try: + content = api.read_paper(id=paper_id) + except HfHubHTTPError as e: + if e.response.status_code == 404: + raise CLIError(f"Paper '{paper_id}' not found on the Hub.") from e + raise + print(content) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 365ef0bea3..48fea57014 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -10471,6 +10471,26 @@ def paper_info(self, id: str) -> PaperInfo: hf_raise_for_status(r) return PaperInfo(**r.json()) + def read_paper(self, id: str) -> str: + """ + Get the markdown content of a paper page on the Hub. + + Args: + id (`str`): + ArXiv id of the paper. + + Returns: + `str`: The paper page content as markdown. + + Raises: + [`HfHubHTTPError`]: + HTTP 404 If the paper does not exist on the Hub. + """ + path = f"{self.endpoint}/papers/{id}.md" + r = get_session().get(path) + hf_raise_for_status(r) + return r.text + def list_daily_papers( self, *, @@ -12779,6 +12799,7 @@ def get_local_safetensors_metadata(path: Union[str, Path]) -> SafetensorsRepoMet list_papers = api.list_papers paper_info = api.paper_info +read_paper = api.read_paper list_daily_papers = api.list_daily_papers repo_exists = api.repo_exists diff --git a/tests/test_cli.py b/tests/test_cli.py index 79423da790..9df05cd730 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1609,6 +1609,172 @@ def test_datasets_ls_with_sort(self, runner: CliRunner) -> None: assert kwargs["sort"] == "downloads" +class TestPapersCommand: + def _make_paper(self, **kwargs): + from huggingface_hub.hf_api import PaperInfo + + defaults = dict( + id="2502.08025", + title="Attention Is All You Need", + upvotes=42, + numComments=3, + publishedAt="2025-02-12T00:00:00.000Z", + ) + defaults.update(kwargs) + return PaperInfo(**defaults) + + def test_ls_basic(self, runner: CliRunner) -> None: + paper = self._make_paper() + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([paper]) + result = runner.invoke(app, ["papers", "ls", "--format", "json"]) + + assert result.exit_code == 0, result.output + output = json.loads(result.stdout) + assert output[0]["id"] == "2502.08025" + assert output[0]["title"] == "Attention Is All You Need" + + def test_ls_with_sort(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([]) + result = runner.invoke(app, ["papers", "ls", "--sort", "trending"]) + + assert result.exit_code == 0, result.output + _, kwargs = api.list_daily_papers.call_args + assert kwargs["sort"] == "trending" + + def test_ls_with_date(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([]) + result = runner.invoke(app, ["papers", "ls", "--date", "2025-01-23"]) + + assert result.exit_code == 0, result.output + _, kwargs = api.list_daily_papers.call_args + assert kwargs["date"] == "2025-01-23" + + def test_ls_with_week(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([]) + result = runner.invoke(app, ["papers", "ls", "--week", "2025-W09"]) + + assert result.exit_code == 0, result.output + _, kwargs = api.list_daily_papers.call_args + assert kwargs["week"] == "2025-W09" + + def test_ls_with_month(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([]) + result = runner.invoke(app, ["papers", "ls", "--month", "2025-02"]) + + assert result.exit_code == 0, result.output + _, kwargs = api.list_daily_papers.call_args + assert kwargs["month"] == "2025-02" + + def test_ls_with_submitter(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([]) + result = runner.invoke(app, ["papers", "ls", "--submitter", "someuser"]) + + assert result.exit_code == 0, result.output + _, kwargs = api.list_daily_papers.call_args + assert kwargs["submitter"] == "someuser" + + def test_ls_invalid_sort(self, runner: CliRunner) -> None: + result = runner.invoke(app, ["papers", "ls", "--sort", "invalid"]) + assert result.exit_code == 2 + assert "Invalid value" in result.output + + def test_ls_quiet(self, runner: CliRunner) -> None: + paper = self._make_paper() + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_daily_papers.return_value = iter([paper]) + result = runner.invoke(app, ["papers", "ls", "--quiet"]) + + assert result.exit_code == 0, result.output + assert result.stdout.strip() == "2502.08025" + + def test_search_basic(self, runner: CliRunner) -> None: + paper = self._make_paper() + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_papers.return_value = iter([paper]) + result = runner.invoke(app, ["papers", "search", "attention", "--format", "json"]) + + assert result.exit_code == 0, result.output + output = json.loads(result.stdout) + assert output[0]["id"] == "2502.08025" + _, kwargs = api.list_papers.call_args + assert kwargs["query"] == "attention" + + def test_search_with_limit(self, runner: CliRunner) -> None: + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.list_papers.return_value = iter([]) + result = runner.invoke(app, ["papers", "search", "diffusion", "--limit", "5"]) + + assert result.exit_code == 0, result.output + _, kwargs = api.list_papers.call_args + assert kwargs["limit"] == 5 + + def test_info_basic(self, runner: CliRunner) -> None: + paper = self._make_paper() + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.paper_info.return_value = paper + result = runner.invoke(app, ["papers", "info", "2502.08025"]) + + assert result.exit_code == 0, result.output + output = json.loads(result.stdout) + assert output["id"] == "2502.08025" + api.paper_info.assert_called_once_with(id="2502.08025") + + def test_info_not_found(self, runner: CliRunner) -> None: + from huggingface_hub.errors import CLIError, HfHubHTTPError + + mock_response = Mock() + mock_response.status_code = 404 + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.paper_info.side_effect = HfHubHTTPError("Not found", response=mock_response) + result = runner.invoke(app, ["papers", "info", "0000.00000"]) + + assert result.exit_code == 1 + assert isinstance(result.exception, CLIError) + assert "not found" in str(result.exception).lower() + + def test_read_basic(self, runner: CliRunner) -> None: + markdown = "# Attention Is All You Need\n\nThis paper introduces..." + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.read_paper.return_value = markdown + result = runner.invoke(app, ["papers", "read", "2502.08025"]) + + assert result.exit_code == 0, result.output + assert "Attention Is All You Need" in result.stdout + api.read_paper.assert_called_once_with(id="2502.08025") + + def test_read_not_found(self, runner: CliRunner) -> None: + from huggingface_hub.errors import CLIError, HfHubHTTPError + + mock_response = Mock() + mock_response.status_code = 404 + with patch("huggingface_hub.cli.papers.get_hf_api") as api_cls: + api = api_cls.return_value + api.read_paper.side_effect = HfHubHTTPError("Not found", response=mock_response) + result = runner.invoke(app, ["papers", "read", "0000.00000"]) + + assert result.exit_code == 1 + assert isinstance(result.exception, CLIError) + assert "not found" in str(result.exception).lower() + + class TestDatasetsParquetCommand: def test_datasets_parquet_table_output(self, runner: CliRunner) -> None: with patch("huggingface_hub.cli.datasets.get_hf_api") as api_cls: