feat: add rephrasing prompts (#163)

ChenZiHong-Gavin · gemini-code-assist[bot] · web-flow · commit 6a0f682706fc · 2026-01-30T16:46:36.000+08:00
* feat: add rephrasing prompts

* Update graphgen/templates/rephrasing/qa_dialogue_format_rephrasing.py

Co-authored-by: gemini-code-assist[bot] &lt;176961590+gemini-code-assist[bot]@users.noreply.github.com&gt;

* feat: add rephrasing pipeline

* fix: change chunk params

* fix: fix rephrasers

---------

Co-authored-by: gemini-code-assist[bot] &lt;176961590+gemini-code-assist[bot]@users.noreply.github.com&gt;
diff --git a/baselines/BDS/bds.py b/baselines/BDS/bds.py
@@ -8,7 +8,7 @@
 from tqdm.asyncio import tqdm as tqdm_async
 
 from graphgen.bases import BaseLLMWrapper
-from graphgen.common import init_llm
+from graphgen.common.init_llm import init_llm
 from graphgen.storage import NetworkXStorage
 from graphgen.utils import create_event_loop
 
diff --git a/examples/generate/generate_aggregated_qa/aggregated_config.yaml b/examples/generate/generate_aggregated_qa/aggregated_config.yaml
@@ -1,7 +1,7 @@
 global_params:
   working_dir: cache
-  graph_backend: kuzu # graph database backend, support: kuzu, networkx
-  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv
+  graph_backend: networkx # graph database backend, support: kuzu, networkx
+  kv_backend: json_kv # key-value store backend, support: rocksdb, json_kv
 
 nodes:
   - id: read_files # id is unique in the pipeline, and can be referenced by other steps
diff --git a/examples/rephrase/rephrase_style_controlled/README.md b/examples/rephrase/rephrase_style_controlled/README.md
@@ -0,0 +1 @@
+# Rephrase with Style Control
diff --git a/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh b/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh
@@ -0,0 +1,2 @@
+python3 -m graphgen.run \
+--config_file examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml
diff --git a/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml b/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml
@@ -0,0 +1,36 @@
+global_params:
+  working_dir: cache
+  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv
+
+nodes:
+  - id: read
+    op_name: read
+    type: source
+    dependencies: []
+    params:
+      input_path:
+        - examples/input_examples/json_demo.json
+
+  - id: chunk
+    op_name: chunk
+    type: map_batch
+    dependencies:
+      - read
+    execution_params:
+      replicas: 4
+    params:
+      chunk_size: 2048 # larger chunk size for better context
+      chunk_overlap: 200
+
+  - id: rephrase
+    op_name: rephrase
+    type: map_batch
+    dependencies:
+      - chunk
+    execution_params:
+      replicas: 1
+      batch_size: 128
+    save_output: true
+    params:
+      method: style_controlled
+      style: critical_analysis
diff --git a/graphgen/bases/__init__.py b/graphgen/bases/__init__.py
@@ -7,6 +7,7 @@
 from .base_operator import BaseOperator
 from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
+from .base_rephraser import BaseRephraser
 from .base_searcher import BaseSearcher
 from .base_splitter import BaseSplitter
 from .base_storage import BaseGraphStorage, BaseKVStorage, StorageNameSpace
diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py
@@ -28,7 +28,7 @@ def __init__(
         op_name: str = None,
     ):
         # lazy import to avoid circular import
-        from graphgen.common import init_storage
+        from graphgen.common.init_storage import init_storage
         from graphgen.utils import set_logger
 
         log_dir = os.path.join(working_dir, "logs")
diff --git a/graphgen/bases/base_rephraser.py b/graphgen/bases/base_rephraser.py
@@ -0,0 +1,41 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseRephraser(ABC):
+    """
+    Abstract base for rephrasers that rewrite text with an LLM.
+
+    Subclasses supply the prompt (``build_prompt``) and the response parsing
+    (``parse_response``); ``rephrase`` chains them around a single LLM call.
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        self.llm_client = llm_client
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build the prompt sent to the LLM for the given text."""
+
+    @staticmethod
+    @abstractmethod
+    def parse_response(response: str) -> Any:
+        """Parse the raw LLM response into the rephrased result."""
+
+    async def rephrase(
+        self,
+        item: dict,
+    ) -> Any:
+        """
+        Rephrase a single item.
+
+        :param item: A dict whose "content" key holds the text to rephrase.
+        :return: Whatever ``parse_response`` returns for the LLM answer; the
+            exact type (and whether it can be None) is decided by the subclass.
+        """
+        text = item["content"]
+        prompt = self.build_prompt(text)
+        response = await self.llm_client.generate_answer(prompt)
+        return self.parse_response(response)
diff --git a/graphgen/common/__init__.py b/graphgen/common/__init__.py
@@ -1,2 +1,2 @@
-from .init_llm import init_llm
-from .init_storage import init_storage
+# from .init_llm import init_llm
+# from .init_storage import init_storage
diff --git a/graphgen/engine.py b/graphgen/engine.py
@@ -11,7 +11,8 @@
 from ray.data.datasource.filename_provider import FilenameProvider
 
 from graphgen.bases import Config, Node
-from graphgen.common import init_llm, init_storage
+from graphgen.common.init_llm import init_llm
+from graphgen.common.init_storage import init_storage
 from graphgen.utils import logger
 
 
@@ -70,6 +71,7 @@ def __init__(
 
         if not ray.is_initialized():
             context = ray.init(
+                include_dashboard=True,
                 ignore_reinit_error=True,
                 logging_level=logging.ERROR,
                 log_to_driver=True,
diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py
@@ -37,6 +37,7 @@
     RDFReader,
     TXTReader,
 )
+from .rephraser import StyleControlledRephraser
 from .searcher.db.ncbi_searcher import NCBISearch
 from .searcher.db.rnacentral_searcher import RNACentralSearch
 from .searcher.db.uniprot_searcher import UniProtSearch
diff --git a/graphgen/models/rephraser/__init__.py b/graphgen/models/rephraser/__init__.py
@@ -0,0 +1 @@
+from .style_controlled_rephraser import StyleControlledRephraser
diff --git a/graphgen/models/rephraser/style_controlled_rephraser.py b/graphgen/models/rephraser/style_controlled_rephraser.py
@@ -0,0 +1,54 @@
+from typing import Any, Optional
+
+from graphgen.bases import BaseRephraser
+from graphgen.templates import STYLE_CONTROLLED_REPHRASING_PROMPTS
+from graphgen.utils import detect_main_language, logger
+
+
+class StyleControlledRephraser(BaseRephraser):
+    """
+    Rephrase input text in a named style.
+
+    The style selects an entry of STYLE_CONTROLLED_REPHRASING_PROMPTS; the
+    detected main language of each text selects the template inside it.
+    """
+
+    def __init__(self, llm_client: Any, style: str = "critical_analysis"):
+        """
+        :param llm_client: LLM client forwarded to BaseRephraser.
+        :param style: Key into STYLE_CONTROLLED_REPHRASING_PROMPTS.
+        :raises ValueError: If ``style`` is not a known style.
+        """
+        super().__init__(llm_client)
+        # Fail fast on a misconfigured style rather than hitting a bare
+        # KeyError for every text once prompts are being built.
+        if style not in STYLE_CONTROLLED_REPHRASING_PROMPTS:
+            supported = ", ".join(sorted(STYLE_CONTROLLED_REPHRASING_PROMPTS))
+            raise ValueError(
+                f"Unsupported rephrasing style: {style!r}. Supported: {supported}"
+            )
+        self.style = style
+
+    def build_prompt(self, text: str) -> str:
+        """Fill the template for this style and the text's language."""
+        logger.debug("Text to be rephrased: %s", text)
+        language = detect_main_language(text)
+        # NOTE(review): assumes every style has a template for each language
+        # detect_main_language can return; otherwise this raises KeyError.
+        prompt_template = STYLE_CONTROLLED_REPHRASING_PROMPTS[self.style][language]
+        return prompt_template.format(text=text)
+
+    @staticmethod
+    def parse_response(response: str) -> Optional[dict]:
+        """
+        Wrap the stripped LLM response as ``{"content": ...}``.
+
+        :return: None when the response is missing or blank.
+        """
+        result = (response or "").strip()
+        logger.debug("Raw rephrased response: %s", result)
+        if not result:
+            return None
+        return {
+            "content": result,
+        }
diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py
@@ -8,6 +8,7 @@
 from .partition import PartitionService
 from .quiz import QuizService
 from .read import read
+from .rephrase import RephraseService
 from .search import SearchService
 
 operators = {
@@ -21,5 +22,6 @@
     "partition": PartitionService,
     "generate": GenerateService,
     "evaluate": EvaluateService,
+    "rephrase": RephraseService,
     "filter": FilterService,
 }
diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py
@@ -2,7 +2,8 @@
 
 from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator
 from graphgen.bases.datatypes import Chunk
-from graphgen.common import init_llm, init_storage
+from graphgen.common.init_llm import init_llm
+from graphgen.common.init_storage import init_storage
 from graphgen.utils import logger
 
 from .build_mm_kg import build_mm_kg
diff --git a/graphgen/operators/evaluate/evaluate_service.py b/graphgen/operators/evaluate/evaluate_service.py
@@ -1,7 +1,8 @@
 from typing import Tuple
 
 from graphgen.bases import BaseLLMWrapper, BaseOperator
-from graphgen.common import init_llm, init_storage
+from graphgen.common.init_llm import init_llm
+from graphgen.common.init_storage import init_storage
 from graphgen.utils import logger
 
 from .evaluate_kg import evaluate_kg
diff --git a/graphgen/operators/extract/extract_service.py b/graphgen/operators/extract/extract_service.py
@@ -2,7 +2,7 @@
 from typing import Tuple
 
 from graphgen.bases import BaseLLMWrapper, BaseOperator, Chunk
-from graphgen.common import init_llm
+from graphgen.common.init_llm import init_llm
 from graphgen.models.extractor import SchemaGuidedExtractor
 from graphgen.utils import logger, run_concurrent
 
diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py
@@ -1,6 +1,8 @@
 from typing import Tuple
+
 from graphgen.bases import BaseKVStorage, BaseLLMWrapper, BaseOperator
-from graphgen.common import init_llm, init_storage
+from graphgen.common.init_llm import init_llm
+from graphgen.common.init_storage import init_storage
 from graphgen.utils import logger, run_concurrent
 
 
diff --git a/graphgen/operators/judge/judge_service.py b/graphgen/operators/judge/judge_service.py
@@ -1,8 +1,9 @@
-from typing import Tuple
 import math
+from typing import Tuple
 
 from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator
-from graphgen.common import init_llm, init_storage
+from graphgen.common.init_llm import init_llm
+from graphgen.common.init_storage import init_storage
 from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
 from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
 
diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py
@@ -2,7 +2,7 @@
 from typing import Iterable, Tuple
 
 from graphgen.bases import BaseGraphStorage, BaseOperator, BaseTokenizer
-from graphgen.common import init_storage
+from graphgen.common.init_storage import init_storage
 from graphgen.models import (
     AnchorBFSPartitioner,
     BFSPartitioner,
diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py
@@ -1,7 +1,8 @@
 from typing import Tuple
 
 from graphgen.bases import BaseGraphStorage, BaseLLMWrapper, BaseOperator
-from graphgen.common import init_llm, init_storage
+from graphgen.common.init_llm import init_llm
+from graphgen.common.init_storage import init_storage
 from graphgen.models import QuizGenerator
 from graphgen.utils import logger, run_concurrent
 
diff --git a/graphgen/operators/read/read.py b/graphgen/operators/read/read.py
@@ -3,7 +3,7 @@
 
 import ray
 
-from graphgen.common import init_storage
+from graphgen.common.init_storage import init_storage
 from graphgen.models import (
     CSVReader,
     JSONReader,
diff --git a/graphgen/operators/rephrase/__init__.py b/graphgen/operators/rephrase/__init__.py
@@ -0,0 +1 @@
+from .rephrase_service import RephraseService
diff --git a/graphgen/operators/rephrase/rephrase_service.py b/graphgen/operators/rephrase/rephrase_service.py
@@ -0,0 +1,69 @@
+from typing import Tuple
+
+from graphgen.bases import BaseLLMWrapper, BaseOperator
+from graphgen.common.init_llm import init_llm
+from graphgen.utils import run_concurrent
+
+
+class RephraseService(BaseOperator):
+    """
+    Rephrase the text of each input item with an LLM-backed rephraser.
+    """
+
+    def __init__(
+        self,
+        working_dir: str = "cache",
+        method: str = "style_controlled",
+        **rephrase_kwargs,
+    ):
+        """
+        :param working_dir: Working directory passed to BaseOperator.
+        :param method: Rephrasing method; only "style_controlled" is supported.
+        :param rephrase_kwargs: Method-specific options, e.g. "style" for the
+            style-controlled rephraser (defaults to "critical_analysis").
+        :raises ValueError: If ``method`` is not supported.
+        """
+        super().__init__(working_dir=working_dir, op_name="rephrase_service")
+        self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
+        self.method = method
+        self.rephrase_kwargs = rephrase_kwargs
+
+        if self.method == "style_controlled":
+            from graphgen.models import StyleControlledRephraser
+
+            self.rephraser = StyleControlledRephraser(
+                self.llm_client,
+                style=rephrase_kwargs.get("style", "critical_analysis"),
+            )
+        else:
+            raise ValueError(f"Unsupported rephrase method: {self.method}")
+
+    def process(self, batch: list) -> Tuple[list, dict]:
+        """
+        Rephrase the texts in the batch.
+        :return: A tuple of (results, meta_updates)
+            results: A list of dicts containing rephrased texts. Each dict has the structure:
+                {"_trace_id": str, "content": str}
+            meta_updates: A dict mapping source IDs to lists of trace IDs for the rephrased texts.
+        """
+        final_results = []
+        meta_updates = {}
+
+        results = run_concurrent(
+            self.rephraser.rephrase,
+            batch,
+            desc="Rephrasing texts",
+            unit="batch",
+        )
+
+        input_trace_ids = [item["_trace_id"] for item in batch]
+        # NOTE(review): pairing by position assumes run_concurrent returns one
+        # result per input item, in input order — confirm against its contract.
+        for input_trace_id, rephrased in zip(input_trace_ids, results):
+            if not rephrased:
+                continue
+            rephrased["_trace_id"] = self.get_trace_id(rephrased)
+            meta_updates.setdefault(input_trace_id, []).append(rephrased["_trace_id"])
+            final_results.append(rephrased)
+
+        return final_results, meta_updates
diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 from graphgen.bases import BaseOperator
-from graphgen.common import init_storage
+from graphgen.common.init_storage import init_storage
 from graphgen.utils import compute_content_hash, logger, run_concurrent
 
 
diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py
@@ -14,5 +14,6 @@
     VQA_GENERATION_PROMPT,
 )
 from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
+from .rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS
 from .search_judgement import SEARCH_JUDGEMENT_PROMPT
 from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
diff --git a/graphgen/templates/rephrasing/__init__.py b/graphgen/templates/rephrasing/__init__.py
@@ -0,0 +1 @@
+from .style_controlled_rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py
@@ -0,0 +1,21 @@
+from .critical_analysis_rephrasing import CRITICAL_ANALYSIS_REPHRASING_PROMPTS
+from .cross_domain_analogy_rephrasing import CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS
+from .executive_summary_rephrasing import EXECUTIVE_SUMMARY_REPHRASING_PROMPTS
+from .first_person_narrative_rephrasing import FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS
+from .historical_evolution_perspective_rephrasing import (
+    HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS,
+)
+from .popular_science_rephrasing import POPULAR_SCIENCE_REPHRASING_PROMPTS
+from .qa_dialogue_format_rephrasing import QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS
+from .technical_deep_dive_rephrasing import TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS
+
+STYLE_CONTROLLED_REPHRASING_PROMPTS = {
+    "popular_science": POPULAR_SCIENCE_REPHRASING_PROMPTS,
+    "critical_analysis": CRITICAL_ANALYSIS_REPHRASING_PROMPTS,
+    "cross_domain_analogy": CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS,
+    "technical_deep_dive": TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS,
+    "executive_summary": EXECUTIVE_SUMMARY_REPHRASING_PROMPTS,
+    "first_person_narrative": FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS,
+    "historical_evolution_perspective": HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS,
+    "qa_dialogue_format": QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS,
+}
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/executive_summary_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/executive_summary_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/first_person_narrative_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/first_person_narrative_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/historical_evolution_perspective_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/historical_evolution_perspective_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/popular_science_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/popular_science_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/qa_dialogue_format_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/qa_dialogue_format_rephrasing.py
diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/technical_deep_dive_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/technical_deep_dive_rephrasing.py
diff --git a/requirements.txt b/requirements.txt

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+python3 -m graphgen.run \`
	`2`	`+--config_file examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml`