Skip to content

Commit 198a8f4

Browse files
authored
Merge branch 'main' into process_report
2 parents f6b06e1 + 121c971 commit 198a8f4

22 files changed

+4574
-162
lines changed

.github/workflows/1-fetch.yml

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,10 @@ name: Fetch Data
33
on:
44
schedule:
55
# Normal schedule
6-
# # at 01:15 on all days in first month of each quarter
7-
# - cron: '15 1 * 1,4,7,10 *'
8-
# # at 01:15 on days 1-14 in second month of each quarter
9-
# - cron: '15 1 1-14 2,5,8,11 *'
10-
# Temp schedule
11-
# at 01:15 on all days in all months
12-
- cron: '15 1 * * *'
6+
# at 03:15 on all days in first month of each quarter
7+
- cron: '15 3 * 1,4,7,10 *'
8+
# at 03:15 on days 1-14 in second month of each quarter
9+
- cron: '15 3 1-14 2,5,8,11 *'
1310

1411
workflow_dispatch:
1512

@@ -29,7 +26,7 @@ jobs:
2926
git config --global user.email "${{ secrets.BOT_EMAIL }}"
3027
3128
- name: Checkout repository
32-
uses: actions/checkout@v4
29+
uses: actions/checkout@v6
3330
with:
3431
# Default fetch-depth is 1, however that value results in errors
3532
# when GitPython attempts to push changes:
@@ -38,7 +35,7 @@ jobs:
3835
token: ${{ secrets.BOT_TOKEN }}
3936

4037
- name: Set up Python
41-
uses: actions/setup-python@v5
38+
uses: actions/setup-python@v6
4239
with:
4340
python-version: '3.11'
4441

@@ -50,20 +47,45 @@ jobs:
5047
run: |
5148
pipenv sync --system
5249
53-
# CC Technology team members:
54-
# See cc-quantifying-bot Google Workspace entry in Bitwarden for
55-
# information on GCS_ secrets
56-
- name: Fetch from Google Custom Search (GCS)
57-
run: |
58-
./scripts/1-fetch/gcs_fetch.py \
59-
--limit=100 --enable-save --enable-git
60-
env:
61-
GCS_DEVELOPER_KEY: ${{ secrets.GCS_DEVELOPER_KEY }}
62-
GCS_CX: ${{ secrets.GCS_CX }}
50+
# Fetch from arXiv disabled due to long run time (~6 hours)
51+
#
52+
# For now, data is fetched manually :/
53+
54+
# Fetch from Europeana disabled due to being considered incomplete
55+
# https://github.com/creativecommons/quantifying/issues/224
56+
57+
# Fetch from GCS disabled due to Google blocking GitHub Action runners
58+
# # CC Technology team members:
59+
# # See cc-quantifying-bot Google Workspace entry in Bitwarden for
60+
# # information on GCS_ secrets
61+
# - name: Fetch from Google Custom Search (GCS)
62+
# run: |
63+
# ./scripts/1-fetch/gcs_fetch.py \
64+
# --limit=100 --enable-save --enable-git
65+
# env:
66+
# GCS_DEVELOPER_KEY: ${{ secrets.GCS_DEVELOPER_KEY }}
67+
# GCS_CX: ${{ secrets.GCS_CX }}
68+
#
69+
# For now, data is fetched manually :/
6370

6471
- name: Fetch from GitHub
6572
run: |
6673
./scripts/1-fetch/github_fetch.py \
6774
--enable-save --enable-git
6875
env:
6976
GH_TOKEN: ${{ secrets.BOT_TOKEN }}
77+
78+
# Fetch from Openverse disabled due to limitations of anonymous API
79+
# access
80+
81+
- name: Fetch from Smithsonian
82+
run: |
83+
./scripts/1-fetch/smithsonian_fetch.py \
84+
--enable-save --enable-git
85+
env:
86+
DATA_GOV_API_KEY: ${{ secrets.DATA_GOV_API_KEY }}
87+
88+
- name: Fetch from Wikipedia
89+
run: |
90+
./scripts/1-fetch/wikipedia_fetch.py \
91+
--enable-save --enable-git
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"TOOL_IDENTIFIER","COUNT"
2+
"CC BY 3.0","7913"
3+
"CC BY 4.0","485432"
4+
"CC BY-NC-ND 4.0","76372"
5+
"CC BY-NC-SA 3.0","5874"
6+
"CC BY-NC-SA 4.0","57750"
7+
"CC BY-SA 4.0","26477"
8+
"CC0 1.0","19617"

data/2026Q1/1-fetch/arxiv_2_count_by_category_report.csv

Lines changed: 1076 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"TOOL_IDENTIFIER","YEAR","COUNT"
2+
"CC BY 3.0","2008","3"
3+
"CC BY 3.0","2009","878"
4+
"CC BY 3.0","2010","574"
5+
"CC BY 3.0","2011","640"
6+
"CC BY 3.0","2012","899"
7+
"CC BY 3.0","2013","1173"
8+
"CC BY 3.0","2014","1376"
9+
"CC BY 3.0","2015","1817"
10+
"CC BY 3.0","2016","271"
11+
"CC BY 3.0","2017","73"
12+
"CC BY 3.0","2018","43"
13+
"CC BY 3.0","2019","114"
14+
"CC BY 3.0","2020","15"
15+
"CC BY 3.0","2021","7"
16+
"CC BY 3.0","2022","7"
17+
"CC BY 3.0","2023","14"
18+
"CC BY 3.0","2024","6"
19+
"CC BY 3.0","2025","3"
20+
"CC BY 4.0","2015","572"
21+
"CC BY 4.0","2016","1507"
22+
"CC BY 4.0","2017","2083"
23+
"CC BY 4.0","2018","3897"
24+
"CC BY 4.0","2019","5051"
25+
"CC BY 4.0","2020","10676"
26+
"CC BY 4.0","2021","48129"
27+
"CC BY 4.0","2022","63751"
28+
"CC BY 4.0","2023","78739"
29+
"CC BY 4.0","2024","99166"
30+
"CC BY 4.0","2025","150068"
31+
"CC BY 4.0","2026","21793"
32+
"CC BY-NC-ND 4.0","2020","770"
33+
"CC BY-NC-ND 4.0","2021","9070"
34+
"CC BY-NC-ND 4.0","2022","10792"
35+
"CC BY-NC-ND 4.0","2023","12851"
36+
"CC BY-NC-ND 4.0","2024","16855"
37+
"CC BY-NC-ND 4.0","2025","23070"
38+
"CC BY-NC-ND 4.0","2026","2964"
39+
"CC BY-NC-SA 3.0","2008","30"
40+
"CC BY-NC-SA 3.0","2009","631"
41+
"CC BY-NC-SA 3.0","2010","693"
42+
"CC BY-NC-SA 3.0","2011","552"
43+
"CC BY-NC-SA 3.0","2012","659"
44+
"CC BY-NC-SA 3.0","2013","847"
45+
"CC BY-NC-SA 3.0","2014","991"
46+
"CC BY-NC-SA 3.0","2015","1162"
47+
"CC BY-NC-SA 3.0","2016","114"
48+
"CC BY-NC-SA 3.0","2017","38"
49+
"CC BY-NC-SA 3.0","2018","27"
50+
"CC BY-NC-SA 3.0","2019","73"
51+
"CC BY-NC-SA 3.0","2020","13"
52+
"CC BY-NC-SA 3.0","2021","16"
53+
"CC BY-NC-SA 3.0","2022","10"
54+
"CC BY-NC-SA 3.0","2023","14"
55+
"CC BY-NC-SA 3.0","2024","3"
56+
"CC BY-NC-SA 3.0","2025","1"
57+
"CC BY-NC-SA 4.0","2015","348"
58+
"CC BY-NC-SA 4.0","2016","837"
59+
"CC BY-NC-SA 4.0","2017","809"
60+
"CC BY-NC-SA 4.0","2018","1585"
61+
"CC BY-NC-SA 4.0","2019","2092"
62+
"CC BY-NC-SA 4.0","2020","3115"
63+
"CC BY-NC-SA 4.0","2021","5560"
64+
"CC BY-NC-SA 4.0","2022","6557"
65+
"CC BY-NC-SA 4.0","2023","8230"
66+
"CC BY-NC-SA 4.0","2024","10997"
67+
"CC BY-NC-SA 4.0","2025","15624"
68+
"CC BY-NC-SA 4.0","2026","1996"
69+
"CC BY-SA 4.0","2015","102"
70+
"CC BY-SA 4.0","2016","300"
71+
"CC BY-SA 4.0","2017","304"
72+
"CC BY-SA 4.0","2018","540"
73+
"CC BY-SA 4.0","2019","699"
74+
"CC BY-SA 4.0","2020","1031"
75+
"CC BY-SA 4.0","2021","2573"
76+
"CC BY-SA 4.0","2022","3145"
77+
"CC BY-SA 4.0","2023","3917"
78+
"CC BY-SA 4.0","2024","5306"
79+
"CC BY-SA 4.0","2025","7512"
80+
"CC BY-SA 4.0","2026","1048"
81+
"CC0 1.0","2015","143"
82+
"CC0 1.0","2016","374"
83+
"CC0 1.0","2017","378"
84+
"CC0 1.0","2018","630"
85+
"CC0 1.0","2019","761"
86+
"CC0 1.0","2020","1127"
87+
"CC0 1.0","2021","2411"
88+
"CC0 1.0","2022","2725"
89+
"CC0 1.0","2023","3017"
90+
"CC0 1.0","2024","3371"
91+
"CC0 1.0","2025","4155"
92+
"CC0 1.0","2026","525"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"TOOL_IDENTIFIER","AUTHOR_BUCKET","COUNT"
2+
"CC BY 3.0","1","2045"
3+
"CC BY 3.0","2","2024"
4+
"CC BY 3.0","3","1620"
5+
"CC BY 3.0","4","842"
6+
"CC BY 3.0","5+","1382"
7+
"CC BY 4.0","1","63398"
8+
"CC BY 4.0","2","93597"
9+
"CC BY 4.0","3","98278"
10+
"CC BY 4.0","4","73517"
11+
"CC BY 4.0","5+","156642"
12+
"CC BY-NC-ND 4.0","1","8480"
13+
"CC BY-NC-ND 4.0","2","14156"
14+
"CC BY-NC-ND 4.0","3","15155"
15+
"CC BY-NC-ND 4.0","4","11935"
16+
"CC BY-NC-ND 4.0","5+","26646"
17+
"CC BY-NC-SA 3.0","1","1560"
18+
"CC BY-NC-SA 3.0","2","1586"
19+
"CC BY-NC-SA 3.0","3","1225"
20+
"CC BY-NC-SA 3.0","4","680"
21+
"CC BY-NC-SA 3.0","5+","823"
22+
"CC BY-NC-SA 4.0","1","6549"
23+
"CC BY-NC-SA 4.0","2","9892"
24+
"CC BY-NC-SA 4.0","3","11253"
25+
"CC BY-NC-SA 4.0","4","9143"
26+
"CC BY-NC-SA 4.0","5+","20913"
27+
"CC BY-SA 4.0","1","3748"
28+
"CC BY-SA 4.0","2","5108"
29+
"CC BY-SA 4.0","3","5396"
30+
"CC BY-SA 4.0","4","4043"
31+
"CC BY-SA 4.0","5+","8182"
32+
"CC0 1.0","1","3525"
33+
"CC0 1.0","2","4421"
34+
"CC0 1.0","3","4121"
35+
"CC0 1.0","4","2681"
36+
"CC0 1.0","5+","4869"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
api_description: Open Archives Initiative Protocol for Metadata Harvesting (OAI-PMH)
2+
api_endpoint: https://oaipmh.arxiv.org/oai
3+
cc_articles_found: 679435
4+
fetch_limit: -1
5+
from_add_date: '2008-02-05'
6+
quarter: 2026Q1
7+
script: arxiv_fetch.py

0 commit comments

Comments
 (0)