numerical_reasoning/generate_dataset/generate_task_2.py at main · markxio/numerical_reasoning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from google import genai
from google.genai.types import GenerateContentConfig
from pydantic import BaseModel
import os
import json
from tqdm import tqdm
import pandas as pd

from prompts import generate_multi_hop_qualitative, generate_multi_hop_quantitative
from generate_questions import get_unprocessed_files_only

cols_percent = [
    "json_file_path",
    "year",
    "ticker_id",
    "item",
    "sentence_id",
    "answer_labels",
    "sentence",
    "question",
    "answer",
    "has_percent_sign_twice",
    "has_percent_sign_thrice",
    "has_two_years",
    "has_three_years"
]

cols_money = [
    "json_file_path",
    "year",
    "ticker_id",
    "item",
    "sentence_id",
    "answer_labels",
    "sentence",
    "question",
    "answer",
    "has_dollar_sign_twice",
    "has_dollar_sign_thrice",
    "has_two_years",
    "has_three_years"
]

with open("google_api_key.json") as f:
    API_KEY = json.load(f)["API_KEY"]

def main():

    with open("config.json") as f:
        config = json.load(f)["generate_task_2"]

    print("-------------------------------")
    print("---------- config -------------")
    print("-------------------------------")
    pprint(config)
    print("-------------------------------")
    print("-------------------------------")
    print("-------------------------------")

    #########################
    # set config args starts
    #########################

    json_response_dir = config['json_response_dir']
    model_name = config['model_name']
    # TODO: implement check
    check_already_processed = config['check_already_processed']

    #########################
    # set config args ends
    #########################

    os.system(f"rm -rf {json_response_dir}")
    os.system(f"mkdir -p {json_response_dir}/quantitative")
    os.system(f"mkdir -p {json_response_dir}/qualitative")

    # for now hardcoded the two labels MONEY and PERCENT
    # TODO: label list as config arg
    # currently hardcoded csv file
    # TODO: add csv input file as config arg
    money = pd.read_csv("FILTER_label_MONEY.csv", sep=";", usecols=cols_money)

    money2 = money[money["has_two_years"]==True]
    money2 = money2[money2["has_dollar_sign_twice"]==True]

    money3 = money[money["has_three_years"]==True]
    money3 = money3[money3["has_dollar_sign_thrice"]==True]

    money = pd.concat([money2, money3], ignore_index=True)

    percent = pd.read_csv("FILTER_label_PERCENT.csv", sep=";", usecols=cols_percent)

    percent2 = percent[percent["has_percent_sign_twice"]==True]
    percent2 = percent2[percent2["has_two_years"]==True]

    percent3 = percent[percent["has_percent_sign_thrice"]==True]
    percent3 = percent3[percent3["has_three_years"]==True]

    percent = pd.concat([percent2, percent3], ignore_index=True)

    dfs = {
        "money": money,
        "percent": percent,
    }

    dfs_output = {
        "money": dfs["money"].copy(),
        "percent": dfs["percent"].copy(),
    }

    system_instructions = {}
    system_instructions["qualitative"] = generate_multi_hop_qualitative
    system_instructions["quantitative"] = generate_multi_hop_quantitative

    for label, df in dfs.items():
        answers = {}
        for json_file_path, sentence in zip(tqdm(df["json_file_path"].tolist()), df["sentence"].tolist()):
            for answer_type, sys_instruct in system_instructions.items():
                response = send_request_gemini(system_instruction=sys_instruct, user_content=sentence, config=config)
                if answer_type not in answers:
                    answers[answer_type]=[]
                current_list = answers[answer_type]
                current_list.append(response["change"])
                answers[answer_type] = current_list

                # quickly write json to file
                filename=json_file_path.split("/")[-1]
                with open(f"{json_response_dir}/{answer_type}/{filename}", "w") as f:
                    json.dump(response, f)

        new_df = dfs_output[label].copy()
        for answer_type, sys_instruct in system_instructions.items():
            # write to file
            #if answer_type in answers:
            new_df[f"answer_2_{answer_type}"] = answers[answer_type] # only works for continuous indices
        new_df.to_csv(f"FILTER_label_{label}_task_2_answers_{model_name}.csv", sep=";", index=False)

class QuestionAnswer(BaseModel):
    change: str

def send_request_gemini(system_instruction, user_content, config):
    client = genai.Client(api_key=API_KEY)

    # https://ai.google.dev/gemini-api/docs/models

    # 2.5 Pro
    # Our most powerful thinking model with maximum response accuracy
    # and state-of-the-art performance

    # 1000 requests per day maximum with my Tier1 key.
    model_name=config['model_name']

    # 2.5 Flash
    # Our best model in terms of price-performance,
    # offering well-rounded  capabilities.

    #model_name="gemini-2.5-flash-preview-04-17"
    #model_name="gemini-2.5-flash-preview-05-20" # new

    response = client.models.generate_content(
        contents=user_content,
        model=model_name,
        config=GenerateContentConfig(
            system_instruction = system_instruction,
            response_mime_type = "application/json",
            response_schema = QuestionAnswer
        ),
        #config=types.GenerateContentConfig(
        #    max_output_tokens=10,
        #    temperature=0.5
        #)
    )

    return response.text

if __name__=="__main__":
    main()