ldbc · ayoubmoussaid · May 11, 2026 · May 12, 2026 · May 12, 2026 · May 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,13 @@
 # specific
 output/
 corpus/
+.history/
+dataset/
+broken_db/
+examples/Oracle_SQLPGQ_Instance/
+examples/generated_corpus/oracle_sqlpgq_*.json
+examples/generated_corpus/cypher_to_oracle_sqlpgq*.json
+test_oracle_sqlpgq_query.json
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -168,4 +175,4 @@ cython_debug/
 #.idea/
 
 # poetry
-poetry.lock
+poetry.lock
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ Awesome-Text2GQL is an AI-assisted framework for Text2GQL dataset construction.
 
 ### Generated Benchmark Dataset
 
-The [Text2GQL-Bench](https://arxiv.org/abs/2602.11745)'s dataset is generated by Awesome-Text2GQL framework. It contains 178,184 (Question, Query) pairs spanning 13 domains. The dataset is available at [Text2GQL-Bench_dataset](https://tugraph-web.oss-cn-beijing.aliyuncs.com/tugraph/datasets/text2gql/Text2GraphQueryBenchmark/Text2GQL-Bench_dataset.zip). To run Text2GQL test, please refer to our [Text2GraphQuery-Driver](https://github.com/TuGraph-family/text2graphquery-driver/tree/main).
+The [Text2GQL-Bench](https://arxiv.org/abs/2602.11745)'s dataset is generated by Awesome-Text2GQL framework. It contains 178,184 (Question, Query) pairs spanning 13 domains. The dataset is available at [Text2GQL-Bench_dataset](https://tugraph-web.oss-cn-beijing.aliyuncs.com/tugraph/datasets/text2gql/Text2GraphQueryBenchmark/Text2GQL-Bench_dataset.zip). A version of the dataset that also includes the Oracle SQL/PGQ translated queries is available at [Dataset-with-SQL/PGQ](https://objectstorage.us-ashburn-1.oraclecloud.com/p/8dIkuVGsfnRQlP3ifxVDjQP0pmidpadEY18ltEbkPC4PrZyLTxjJdqDjbtWIEYUW/n/ogcs/b/Text2GQL-Bench_dataset/o/Text2GQL-Bench_dataset.zip); it includes 19,633 of the 22,407 existing queries. To run the Text2GQL test, please refer to our [Text2GraphQuery-Driver](https://github.com/TuGraph-family/text2graphquery-driver/tree/main).
 
 ## Demo: TuGraph-DB ChatBot
 
@@ -195,6 +195,15 @@ After all, run:
 
 When the script finishes, the generated corpus will be saved to examples/generated_corpus/{graph_name}_template_corpus.json.
 
+#### Oracle SQL Property Graphs (SQL/PGQ)
+
+Awesome-Text2GQL includes Oracle SQL/PGQ support for schema conversion, graph setup, query translation, corpus generation, validation, and benchmark dataset preparation.
+
+For detailed workflows, see:
+
+- [Oracle SQL/PGQ data generation workflow](./doc/en-us/development/oracle_sqlpgq_data_generation_workflow.md): convert framework/TuGraph-style schemas into Oracle SQL/PGQ artifacts, create local Oracle property graphs, generate deterministic and LLM-based corpora, validate generated queries, and combine corpus outputs.
+- [Dataset preparation utilities](./dataset_prep/README.md): translate benchmark Cypher/GQL-like records to Oracle SQL/PGQ, optionally validate them against Oracle, analyze failures, compare Oracle SQL/PGQ results with Neo4j, and export validated datasets.
+
 #### Cypher2GQL
 
 `python ./examples/cypher2gql.py`

diff --git a/app/core/clauses/match_clause.py b/app/core/clauses/match_clause.py
@@ -24,14 +24,16 @@ class EdgePattern:
 class PathPattern:
     node_pattern_list: List[NodePattern]
     edge_pattern_list: List[EdgePattern]
+    path_variable: str = ""
 
 
 class MatchClause(Clause):
-    def __init__(self, path_pattern: PathPattern):
+    def __init__(self, path_pattern: PathPattern, optional: bool = False):
         self.path_pattern = path_pattern
+        self.optional = optional
 
     def to_string(self) -> str:
-        match_string = "MATCH "
+        match_string = "OPTIONAL MATCH " if self.optional else "MATCH "
         path_degree = len(self.path_pattern.edge_pattern_list)
         # add first node
         node_pattern = self.path_pattern.node_pattern_list[0]
@@ -51,7 +53,7 @@ def to_string(self) -> str:
         return match_string
 
     def to_string_cypher(self) -> str:
-        match_string = "MATCH "
+        match_string = "OPTIONAL MATCH " if self.optional else "MATCH "
         path_degree = len(self.path_pattern.edge_pattern_list)
         # add first node
         node_pattern = self.path_pattern.node_pattern_list[0]
@@ -82,7 +84,7 @@ def to_string_cypher(self) -> str:
         return match_string
 
     def to_string_gql(self) -> str:
-        match_string = "MATCH "
+        match_string = "OPTIONAL MATCH " if self.optional else "MATCH "
         path_degree = len(self.path_pattern.edge_pattern_list)
         # add first node
         node_pattern = self.path_pattern.node_pattern_list[0]

diff --git a/app/core/clauses/return_clause.py b/app/core/clauses/return_clause.py
@@ -10,6 +10,7 @@ class ReturnItem:
     property: str
     alias: str
     function_name: str = ""
+    expression: str = ""
 
 
 @dataclass
@@ -18,6 +19,7 @@ class SortItem:
     property: str
     order: str
     function_name: str = ""
+    expression: str = ""
 
 
 @dataclass

diff --git a/app/core/clauses/where_clause.py b/app/core/clauses/where_clause.py
@@ -10,6 +10,7 @@ class CompareExpression:
     property: tuple[str, Dict]
     comparison_type: str
     comparison_value: str
+    raw_expression: str = ""
 
 
 class WhereClause(Clause):

diff --git a/app/core/generalizer/query_generalizer.py b/app/core/generalizer/query_generalizer.py
@@ -6,14 +6,29 @@
 from app.core.clauses.where_clause import CompareExpression, WhereClause
 from app.core.schema.schema_graph import SchemaGraph
 from app.core.schema.schema_parser import SchemaParser
+from app.impl.oracle_sqlpgq.schema.schema_parser import OracleSqlPgqSchemaParser
 from app.impl.tugraph_cypher.schema.schema_parser import TuGraphSchemaParser
 
 
 class QueryGeneralizer:
-    def __init__(self, db_id, instance_path):
+    SCHEMA_PARSERS = {
+        "tugraph_cypher": TuGraphSchemaParser,
+        "tugraph": TuGraphSchemaParser,
+        "oracle_sqlpgq": OracleSqlPgqSchemaParser,
+        "oracle": OracleSqlPgqSchemaParser,
+    }
+
+    def __init__(self, db_id, instance_path, backend: str = "tugraph_cypher"):
         self.db_id = db_id
         self.instance_path = instance_path
-        self.schema_parser: SchemaParser = TuGraphSchemaParser(db_id, instance_path)
+        self.backend = backend
+        parser_class = self.SCHEMA_PARSERS.get(backend)
+        if parser_class is None:
+            supported = ", ".join(sorted(self.SCHEMA_PARSERS))
+            raise ValueError(
+                f"Unsupported schema backend '{backend}'. Supported backends: {supported}"
+            )
+        self.schema_parser: SchemaParser = parser_class(db_id, instance_path)
         self.schema_graph: SchemaGraph = self.schema_parser.get_schema_graph()
 
     def generalize(self, query_pattern: List[Clause]) -> List[str]:
@@ -54,6 +69,11 @@ def generalize_from_llm(self, query_template: str) -> List[str]:
 
     def generalize_from_cypher(self, query_template: str) -> List[str]:
         # TODO: use original awesome-text2gql to generalize new query.
+        if self.backend not in {"tugraph_cypher", "tugraph"}:
+            raise NotImplementedError(
+                "generalize_from_cypher is backed by the TuGraph Cypher generalizer. "
+                "Use get_query_pattern + generalize + an Oracle translator for oracle_sqlpgq."
+            )
         from app.impl.tugraph_cypher.generalizer.graph_query_generalizer import (
             GraphQueryGeneralizer as CypherGeneralizer,
         )

diff --git a/app/core/generator/corpus_generator.py b/app/core/generator/corpus_generator.py
@@ -8,11 +8,46 @@
 
 
 class CorpusGenerator:
-    def __init__(self, llm_client: LlmClient):
+    def __init__(
+        self,
+        llm_client: LlmClient,
+        query_language: str = "cypher",
+        graph_name: str | None = None,
+    ):
         self.llm_client = llm_client
+        self.query_language = query_language.lower()
+        self.graph_name = graph_name
+
+    def _system_prompt(self) -> str:
+        if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}:
+            return corpus.SQLPGQ_SYSTEM_PROMPT
+        return corpus.SYSTEM_PROMPT
+
+    def _instruction_template(self) -> str:
+        if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}:
+            return corpus.SQLPGQ_INSTRUCTION_TEMPLATE
+        return corpus.INSTRUCTION_TEMPLATE
+
+    def _translation_prompt_template(self) -> str:
+        if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}:
+            return corpus.SQLPGQ_TRANSLATION_PROMPT_TEMPLATE
+        return corpus.TRANSLATION_PROMPT_TEMPLATE
+
+    def _query_template_instruction(self) -> str:
+        if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}:
+            return corpus.SQLPGQ_QUERY_TEMPLATE_INSTRUCTION
+        return corpus.QUERY_TEMPLATE_INSTRUCTION
+
+    def _query_archetypes(self) -> List[str]:
+        if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}:
+            return corpus.SQLPGQ_QUERY_ARCHETYPES
+        return corpus.QUERY_ARCHETYPES
 
     def _extract_json_from_response(self, response: str, expect_list: bool = True):
         """Extract JSON from LLM response."""
+        if not response:
+            print(" [Warning] Empty LLM response.")
+            return [] if expect_list else {}
         try:
             start_char, end_char = ("[", "]") if expect_list else ("{", "}")
             json_start = response.find(start_char)
@@ -40,7 +75,7 @@ def generate_questions_batch(
         all_questions = set()
 
         # Randomly select a query intent archetype to guide generation
-        archetype = random.choice(corpus.QUERY_ARCHETYPES)
+        archetype = random.choice(self._query_archetypes())
         print(f"Brainstorming questions with intent: '{archetype.split(':')[0]}'")
 
         instruction = corpus.EXPLORATION_PROMPT_TEMPLATE.format(
@@ -50,7 +85,7 @@ def generate_questions_batch(
             num_to_generate=questions_per_call,
         )
         message = [
-            {"role": "system", "content": corpus.SYSTEM_PROMPT},
+            {"role": "system", "content": self._system_prompt()},
             {"role": "user", "content": instruction},
         ]
 
@@ -69,16 +104,17 @@ def generate_translation_batch(
         self, schema_json: str, questions: List[str], error_context: Dict[str, str] = None
     ) -> List[Dict[str, Any]]:
         """
-        Translate a list of questions into Cypher queries.
+        Translate a list of questions into the configured graph query language.
         Supports retries by providing an error_context.
         """
-        instruction = corpus.TRANSLATION_PROMPT_TEMPLATE.format(
+        instruction = self._translation_prompt_template().format(
             schema_json=schema_json,
             question=questions[0],  # Assuming one question per call for clarity
+            graph_name=self.graph_name or "GRAPH_NAME",
             error_context=error_context if error_context else "",
         )
         message = [
-            {"role": "system", "content": corpus.SYSTEM_PROMPT},
+            {"role": "system", "content": self._system_prompt()},
             {"role": "user", "content": instruction},
         ]
 
@@ -193,13 +229,14 @@ def run_generation_loop(
                 selected_contexts = random_examples
 
                 # 1. Build Prompt
-                instruction = corpus.INSTRUCTION_TEMPLATE.format(
+                instruction = self._instruction_template().format(
                     schema_json=schema_json,
                     examples_json=json.dumps(selected_contexts, indent=2, ensure_ascii=False),
                     num_per_iteration=num_per_iteration,
+                    graph_name=self.graph_name or "GRAPH_NAME",
                 )
                 message = [
-                    {"role": "system", "content": corpus.SYSTEM_PROMPT},
+                    {"role": "system", "content": self._system_prompt()},
                     {"role": "user", "content": instruction},
                 ]
 
@@ -274,7 +311,7 @@ def generate_template_based_corpus(
             # 3. Construct the Prompt
             # We directly provide the "raw" data and ask the LLM to do three things: 
             # extract information, fill the template, and generate questions.
-            instraction = corpus.QUERY_TEMPLATE_INSTRUCTION.format(
+            instraction = self._query_template_instruction().format(
                 raw_data_str=raw_data_str,
                 current_batch_size=current_batch_size,
                 selected_templates=selected_templates,
@@ -283,7 +320,7 @@ def generate_template_based_corpus(
             message = [
                 {
                     "role": "system",
-                    "content": "You are a helpful assistant that generates Cypher datasets.",
+                    "content": self._system_prompt(),
                 },
                 {"role": "user", "content": instraction},
             ]