-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpaper_to_code.py
More file actions
250 lines (196 loc) · 10.7 KB
/
paper_to_code.py
File metadata and controls
250 lines (196 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import re
import openai
import PyPDF2
import requests
from tqdm import tqdm
from io import BytesIO
# Prompts
# Persona injected as the "system" message of every chat-completion request.
system_prompt = "You are a useful helper. You specialize in natural language processing, programming, and troubleshooting. You know how important the jobs you participate in are, and you dedicate yourself to being a perfectionist and attentive to detail."
# First-stage instruction: rewrite the raw text extracted from the PDF into a
# clean, technical description of the paper's approach (prepended to the paper text).
base_paper_prompt = "Rewrite the presented approach with correct, academic grammar. Write as much of each proposed detail as possible and how to apply it. Technical and mathematical details are crucial. The generated text needs to be expository and informative. Do not remove important information. Remain unbiased, without critical or opinionated comments. Your text needs to expose the idea, explain it technically and every detail about how it works. The advantages of the approach don't matter, only how you apply it matters."
# Second-stage instruction: merge the rewritten approach into the target code
# (prepended to the approach text plus the target file's source).
base_code_prompt = "Below is the text of an approach described in detail, as well as the code that needs that approach applied to it. This approach can be applied to code and accurately describes how this can be done. First, write how you will apply the approach in a nutshell. Then rewrite the code adding and changing as needed to include all the specifics of the approach. Not only, add parameters in the functions so that they are flexible. Don't describe how to apply the approach, just write the code. Ensures you are only using declared variables. The most important thing is to ensure that all the code is written, with no incomplete or missing parts."
class GPTTextGenerator:
    """Thin wrapper around the OpenAI chat-completions API.

    Stores sampling parameters once at construction time so repeated calls to
    :meth:`generate` reuse the same configuration.
    """

    def __init__(self, gpt_model: str = "gpt-3.5-turbo-16k", temperature: float = 0, max_tokens: int = 4096,
                 top_p: float = 0, frequency_penalty: float = 0, presence_penalty: float = 0):
        """
        Initializes the GPTTextGenerator class.
        Args:
            gpt_model (str): Name of the GPT model. Default: "gpt-3.5-turbo-16k".
            temperature (float): Degree of randomness. Default: 0.
            max_tokens (int): Maximum response length in tokens. Default: 4096.
            top_p (float): Diversity of response by capping cumulative probability. Default: 0.
            frequency_penalty (float): Controls repetition. Default: 0.
            presence_penalty (float): Controls relevance to input. Default: 0.
        """
        self.gpt_model = gpt_model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty

    def generate(self, prompt: str) -> str:
        """
        Generates a response to the prompt using GPT API.
        Args:
            prompt (str): Input prompt.
        Returns:
            str: Generated response or empty string on error.
        """
        try:
            # The client reads the API key from the OPENAI_API_KEY environment
            # variable by default.
            client = openai.OpenAI()
            response = client.chat.completions.create(
                model=self.gpt_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                top_p=self.top_p,
                frequency_penalty=self.frequency_penalty,
                presence_penalty=self.presence_penalty
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Deliberate best-effort: callers treat "" as "generation failed"
            # rather than having the whole pipeline crash on an API error.
            print(f"Error: {e}")
            return ""
def fetch_pdf_text(pdf_url: str, timeout: float = 30) -> str:
    """
    Fetches the text content from a PDF URL.

    Args:
        pdf_url (str): The URL of the PDF.
        timeout (float, optional): Seconds to wait for the HTTP response before
            giving up. Defaults to 30.

    Returns:
        str: The extracted text from the PDF.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Download the PDF. A timeout prevents the pipeline from hanging forever,
    # and raise_for_status() stops us from feeding an HTML error page to the
    # PDF parser.
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()
    pdf_bytes = BytesIO(response.content)

    pdf_reader = PyPDF2.PdfReader(pdf_bytes)
    # extract_text() may return None for pages without a text layer; coerce to
    # "" so the join never fails. Joining once avoids O(n^2) concatenation.
    pages = [page.extract_text() or '' for page in tqdm(pdf_reader.pages)]
    return ''.join(pages)
def extract_and_filter_text(text: str, start_marker: str = "abstract", end_marker: str = "references",
                            apply_filter: bool = False, min_chars: int = 2) -> str:
    """
    Extracts and optionally filters text between specified markers from the input text.

    Marker matching is case-insensitive. The line containing ``start_marker``
    (and everything before it) is skipped; the scan stops at — and excludes —
    the first later line containing ``end_marker``.

    Args:
        text (str): The input text.
        start_marker (str, optional): From where the text should be extracted. Defaults to "abstract".
        end_marker (str, optional): The marker indicating the end of the text to extract. Defaults to "references".
        apply_filter (bool, optional): Whether to apply alphabetic line filtering. Defaults to False.
        min_chars (int, optional): Number of alphabetic characters required per line for filtering. Defaults to 2.

    Returns:
        str: The text extracted between the specified markers, optionally filtered based on the conditions.
    """
    start_found = False
    lower_start = start_marker.lower()
    lower_end = end_marker.lower()
    # Collect lines and join once at the end instead of repeated string
    # concatenation (which is quadratic for large papers).
    kept_lines = []
    for line in tqdm(text.splitlines(True)):
        if not start_found:
            if lower_start in line.lower():
                start_found = True
        elif lower_end in line.lower():
            break
        # Simplified from `not apply_filter or (apply_filter and ...)`:
        # the second `apply_filter` test was redundant.
        elif not apply_filter or sum(c.isalpha() for c in line) >= min_chars:
            kept_lines.append(line)
    return ''.join(kept_lines)
def clean_non_alphabetic_items(text: str) -> str:
    """
    Removes non-alphabetic items (excluding digits) from the input text.

    A whitespace-separated token is kept when it contains at least one letter,
    or when it is made up entirely of digits; everything else (bare
    punctuation, symbol runs, etc.) is dropped.

    Args:
        text (str): The input text.
    Returns:
        str: The text with non-alphabetic items removed.
    """
    kept = []
    for token in tqdm(text.split()):
        has_letter = any(ch.isalpha() for ch in token)
        if has_letter or token.isdigit():
            kept.append(token)
    return ' '.join(kept)
def replace_urls_with_placeholder(text: str) -> str:
    """
    Replaces URLs in the input text with a placeholder.

    Every http:// or https:// URL matched by the pattern is rewritten as the
    literal word "link" so that papers' reference links don't leak into the
    generated prompt.

    Args:
        text (str): The input text.
    Returns:
        str: The text with URLs replaced by a placeholder.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub('link', text)
def start_of_code(code_text: str, start_keyword: str = "import") -> str:
    """
    Returns the portion of code starting from a specified keyword.

    Used to strip the model's prose preamble from a generated answer: everything
    before the first line that starts with ``start_keyword`` is discarded, and
    that line plus all following lines are returned unchanged.

    Args:
        code_text (str): The input code text.
        start_keyword (str, optional): The keyword to start from. Defaults to "import".
    Returns:
        str: The code text starting from the keyword (empty if never found).
    """
    start_found = False
    # Collect and join once instead of quadratic `+=` concatenation; also skip
    # the redundant startswith() re-check once the keyword has been seen.
    collected = []
    for line in tqdm(code_text.splitlines(True)):
        if not start_found and line.startswith(start_keyword):
            start_found = True
        if start_found:
            collected.append(line)
    return "".join(collected)
def generate_updated_code(pdf_url: str, target_file: str, final_name: str,
                          start_marker: str = "abstract", end_marker: str = "references",
                          min_chars: int = 2, apply_filter: bool = True, start_keyword: str = "import",
                          gpt_model: str = "gpt-3.5-turbo-16k", temperature: float = 0,
                          max_tokens: int = 4096, top_p: float = 0, frequency_penalty: float = 0,
                          presence_penalty: float = 0) -> str:
    """
    Generates and writes updated code based on a PDF paper and existing code.
    Args:
        pdf_url (str): URL of the PDF paper.
        target_file (str): Path to the target code file.
        final_name (str): Path to the final updated code file.
        start_marker (str, optional): The starting marker for text extraction. Defaults to "abstract".
        end_marker (str, optional): The ending marker for text extraction. Defaults to "references".
        min_chars (int, optional): The minimum number of alphabetic characters in filtered lines. Defaults to 2.
        apply_filter (bool, optional): Whether to apply alphabetic line filtering. Defaults to True.
        start_keyword (str, optional): The keyword to start code extraction. Defaults to "import".
        gpt_model (str, optional): Name of the GPT model. Defaults to "gpt-3.5-turbo-16k".
        temperature (float, optional): Degree of randomness for GPT response. Defaults to 0.
        max_tokens (int, optional): Maximum response length in tokens. Defaults to 4096.
        top_p (float, optional): Diversity of response by capping cumulative probability. Defaults to 0.
        frequency_penalty (float, optional): Controls repetition in GPT response. Defaults to 0.
        presence_penalty (float, optional): Controls relevance to input in GPT response. Defaults to 0.
    Returns:
        str: The generated updated code.
    """
    # Fetch text content from the PDF URL
    pdf_text = fetch_pdf_text(pdf_url)
    # Extract the text between the specified headings
    extracted_text = extract_and_filter_text(pdf_text, start_marker=start_marker, end_marker=end_marker,
                                             apply_filter=apply_filter, min_chars=min_chars)
    # Remove non-alphabetic items (bare punctuation/symbols) from the text
    cleaned_text = clean_non_alphabetic_items(extracted_text)
    # Replace URLs with a placeholder ("link") in the text
    paper_text = replace_urls_with_placeholder(cleaned_text)
    # Create the prompt for the paper generation
    paper_prompt = f"{base_paper_prompt}\n\n{paper_text}"
    # Initialize the GPTTextGenerator with provided parameters
    model = GPTTextGenerator(gpt_model=gpt_model, temperature=temperature, max_tokens=max_tokens,
                             top_p=top_p, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty)
    # Generate the rewritten approach text for the paper (flattened to one line)
    approach_text = model.generate(paper_prompt).replace("\n", " ")
    # Read the target file; explicit UTF-8 so decoding doesn't depend on the
    # platform's default locale encoding.
    with open(target_file, "r", encoding="utf-8") as f:
        code_text = f.read()
    # Remove excessive newlines (consecutive newlines) from the code text
    code_text = re.sub(r'\n+', '\n', code_text)
    # Create the prompt for the code generation
    code_prompt = f"{base_code_prompt}\n\nThe code:\n\n{code_text}\n\nThe approach:\n\n{approach_text}"
    # Generate the updated code with the integrated approach
    updated_code = model.generate(code_prompt)
    # Keep only the code portion of the model's answer (drop any prose preamble)
    final_code = start_of_code(updated_code, start_keyword=start_keyword)
    # Write the updated code to the final file
    with open(final_name, "w", encoding="utf-8") as f:
        f.write(final_code)
    return final_code