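Renamed the UNIT_NAME column to DATA_SOURCE, added "Data Sources" y-axis labels to the Smithsonian report plots, and changed the stacked bar plot x-axis label to "Percentage of works"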

oree-xx · oree-xx · commit f6b06e1d5c1a · 2026-02-24T17:52:40.000+01:00
diff --git a/scripts/1-fetch/smithsonian_fetch.py b/scripts/1-fetch/smithsonian_fetch.py
@@ -40,7 +40,7 @@
 ]
 HEADER_2_UNITS = [
     "UNIT_CODE",
-    "UNIT_NAME",
+    "DATA_SOURCE",
     "CC0_RECORDS",
     "CC0_RECORDS_WITH_CC0_MEDIA",
     "TOTAL_OBJECTS",
@@ -250,7 +250,7 @@ def query_smithsonian(args, session):
         data_units.append(
             {
                 "UNIT_CODE": unit["unit"],
-                "UNIT_NAME": UNIT_MAP.get(unit["unit"], unit["unit"]),
+                "DATA_SOURCE": UNIT_MAP.get(unit["unit"], unit["unit"]),
                 "CC0_RECORDS": unit["metrics"]["CC0_records"],
                 "CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
                     "CC0_records_with_CC0_media"
diff --git a/scripts/2-process/smithsonian_process.py b/scripts/2-process/smithsonian_process.py
@@ -82,13 +82,13 @@ def process_totals_by_units(args, count_data):
     data = {}
 
     for row in count_data.itertuples(index=False):
-        unit = str(row.UNIT_NAME)
+        unit = str(row.DATA_SOURCE)
         total_objects = int(row.TOTAL_OBJECTS)
 
         data[unit] = total_objects
 
-    data = pd.DataFrame(data.items(), columns=["Unit_name", "Total_objects"])
-    data.sort_values("Unit_name", ascending=True, inplace=True)
+    data = pd.DataFrame(data.items(), columns=["Data_source", "Total_objects"])
+    data.sort_values("Data_source", ascending=True, inplace=True)
     data.reset_index(drop=True, inplace=True)
     file_path = shared.path_join(
         PATHS["data_phase"], "smithsonian_totals_by_units.csv"
@@ -104,7 +104,7 @@ def process_totals_by_records(args, count_data):
     data = {}
 
     for row in count_data.itertuples(index=False):
-        unit = str(row.UNIT_NAME)
+        unit = str(row.DATA_SOURCE)
         CC0_records = int(row.CC0_RECORDS)
         CC0_records_with_CC0_media = int(row.CC0_RECORDS_WITH_CC0_MEDIA)
         total_objects = int(row.TOTAL_OBJECTS)
@@ -123,7 +123,7 @@ def process_totals_by_records(args, count_data):
     data = (
         pd.DataFrame.from_dict(data, orient="index")
         .reset_index()
-        .rename(columns={"index": "Unit_name"})
+        .rename(columns={"index": "Data_source"})
     )
     data["CC0_without_media_percentage"] = (
         (
@@ -142,7 +142,7 @@ def process_totals_by_records(args, count_data):
         * 100
     ).round(2)
 
-    data.sort_values("Unit_name", ascending=True, inplace=True)
+    data.sort_values("Data_source", ascending=True, inplace=True)
     data.reset_index(drop=True, inplace=True)
 
     file_path = shared.path_join(
@@ -164,7 +164,7 @@ def main():
         file_count,
         usecols=[
             "UNIT_CODE",
-            "UNIT_NAME",
+            "DATA_SOURCE",
             "CC0_RECORDS",
             "CC0_RECORDS_WITH_CC0_MEDIA",
             "TOTAL_OBJECTS",
diff --git a/scripts/3-report/smithsonian_report.py b/scripts/3-report/smithsonian_report.py
@@ -143,7 +143,7 @@ def plot_totals_by_top10_units(args):
         "smithsonian_totals_by_units.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit_name"
+    name_label = "Data_source"
     data_label = "Total_objects"
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data["Total_objects"] = data["Total_objects"].astype(int)
@@ -157,6 +157,7 @@ def plot_totals_by_top10_units(args):
         title=title,
         name_label=name_label,
         data_label=data_label,
+        bar_ylabel="Data Sources",
     )
 
     image_path = shared.path_join(
@@ -193,7 +194,7 @@ def plot_totals_by_lowest10_units(args):
         "smithsonian_totals_by_units.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit_name"
+    name_label = "Data_source"
     data_label = "Total_objects"
     data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data["Total_objects"] = data["Total_objects"].astype(int)
@@ -207,6 +208,7 @@ def plot_totals_by_lowest10_units(args):
         title=title,
         name_label=name_label,
         data_label=data_label,
+        bar_ylabel="Data Sources",
     )
 
     image_path = shared.path_join(
@@ -243,7 +245,7 @@ def plot_totals_by_top10_unit_records(args):
         "smithsonian_totals_by_records.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit_name"
+    name_label = "Data_source"
     data_label = "Total_objects"
     stack_labels = [
         "CC0_without_media_percentage",
@@ -260,6 +262,7 @@ def plot_totals_by_top10_unit_records(args):
         title=title,
         name_label=name_label,
         stack_labels=stack_labels,
+        ylabel="Data Sources",
     )
     image_path = shared.path_join(
         PATHS["data_phase"], "smithsonian_by_top10_unit_records.png"
@@ -293,7 +296,7 @@ def plot_totals_by_lowest10_unit_records(args):
         "smithsonian_totals_by_records.csv",
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
-    name_label = "Unit_name"
+    name_label = "Data_source"
     data_label = "Total_objects"
     stack_labels = [
         "CC0_without_media_percentage",
@@ -310,6 +313,7 @@ def plot_totals_by_lowest10_unit_records(args):
         title=title,
         name_label=name_label,
         stack_labels=stack_labels,
+        ylabel="Data Sources",
     )
     image_path = shared.path_join(
         PATHS["data_phase"], "smithsonian_by_lowest10_unit_records.png"
diff --git a/scripts/plot.py b/scripts/plot.py
@@ -208,7 +208,7 @@ def stacked_barh_plot(
             for current_left, width in zip(left, data[label])
         ]
 
-    ax.set_xlabel("Number of works")
+    ax.set_xlabel("Percentage of works")
     ax.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter))
     ax.set_yticks(range(len(data.index)))
     ax.set_yticklabels([wrap_label(label) for label in data.index])

Original file line number	Diff line number	Diff line change
`@@ -208,7 +208,7 @@ def stacked_barh_plot(`
`208`	`208`	`for current_left, width in zip(left, data[label])`
`209`	`209`	`]`
`210`	`210`
`211`		`- ax.set_xlabel("Number of works")`
	`211`	`+ ax.set_xlabel("Percentage of works")`
`212`	`212`	`ax.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter))`
`213`	`213`	`ax.set_yticks(range(len(data.index)))`
`214`	`214`	`ax.set_yticklabels([wrap_label(label) for label in data.index])`