Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# specific
output/
corpus/
.history/
dataset/
broken_db/
examples/Oracle_SQLPGQ_Instance/
examples/generated_corpus/oracle_sqlpgq_*.json
examples/generated_corpus/cypher_to_oracle_sqlpgq*.json
test_oracle_sqlpgq_query.json

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down Expand Up @@ -168,4 +175,4 @@ cython_debug/
#.idea/

# poetry
poetry.lock
poetry.lock
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Awesome-Text2GQL is an AI-assisted framework for Text2GQL dataset construction.

### Generated Benchmark Dataset

The [Text2GQL-Bench](https://arxiv.org/abs/2602.11745)'s dataset is generated by Awesome-Text2GQL framework. It contains 178,184 (Question, Query) pairs spanning 13 domains. The dataset is available at [Text2GQL-Bench_dataset](https://tugraph-web.oss-cn-beijing.aliyuncs.com/tugraph/datasets/text2gql/Text2GraphQueryBenchmark/Text2GQL-Bench_dataset.zip). To run Text2GQL test, please refer to our [Text2GraphQuery-Driver](https://github.com/TuGraph-family/text2graphquery-driver/tree/main).
The [Text2GQL-Bench](https://arxiv.org/abs/2602.11745) dataset is generated by the Awesome-Text2GQL framework. It contains 178,184 (Question, Query) pairs spanning 13 domains. The dataset is available at [Text2GQL-Bench_dataset](https://tugraph-web.oss-cn-beijing.aliyuncs.com/tugraph/datasets/text2gql/Text2GraphQueryBenchmark/Text2GQL-Bench_dataset.zip). A version of the dataset that includes the Oracle SQL/PGQ translated queries is available at [Dataset-with-SQL/PGQ](https://objectstorage.us-ashburn-1.oraclecloud.com/p/8dIkuVGsfnRQlP3ifxVDjQP0pmidpadEY18ltEbkPC4PrZyLTxjJdqDjbtWIEYUW/n/ogcs/b/Text2GQL-Bench_dataset/o/Text2GQL-Bench_dataset.zip); it includes 19,633 of the 22,407 existing queries. To run the Text2GQL test, please refer to our [Text2GraphQuery-Driver](https://github.com/TuGraph-family/text2graphquery-driver/tree/main).

## Demo: TuGraph-DB ChatBot

Expand Down Expand Up @@ -195,6 +195,15 @@ After all, run:

When the script finishes, the generated corpus will be saved to examples/generated_corpus/{graph_name}_template_corpus.json.

#### Oracle SQL Property Graphs (SQL/PGQ)

Awesome-Text2GQL includes Oracle SQL/PGQ support for schema conversion, graph setup, query translation, corpus generation, validation, and benchmark dataset preparation.

For detailed workflows, see:

- [Oracle SQL/PGQ data generation workflow](./doc/en-us/development/oracle_sqlpgq_data_generation_workflow.md): convert framework/TuGraph-style schemas into Oracle SQL/PGQ artifacts, create local Oracle property graphs, generate deterministic and LLM-based corpora, validate generated queries, and combine corpus outputs.
- [Dataset preparation utilities](./dataset_prep/README.md): translate benchmark Cypher/GQL-like records to Oracle SQL/PGQ, optionally validate them against Oracle, analyze failures, compare Oracle SQL/PGQ results with Neo4j, and export validated datasets.

#### Cypher2GQL

`python ./examples/cypher2gql.py`
Expand Down
10 changes: 6 additions & 4 deletions app/core/clauses/match_clause.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@ class EdgePattern:
class PathPattern:
node_pattern_list: List[NodePattern]
edge_pattern_list: List[EdgePattern]
path_variable: str = ""


class MatchClause(Clause):
def __init__(self, path_pattern: PathPattern):
def __init__(self, path_pattern: PathPattern, optional: bool = False):
self.path_pattern = path_pattern
self.optional = optional

def to_string(self) -> str:
match_string = "MATCH "
match_string = "OPTIONAL MATCH " if self.optional else "MATCH "
path_degree = len(self.path_pattern.edge_pattern_list)
# add first node
node_pattern = self.path_pattern.node_pattern_list[0]
Expand All @@ -51,7 +53,7 @@ def to_string(self) -> str:
return match_string

def to_string_cypher(self) -> str:
match_string = "MATCH "
match_string = "OPTIONAL MATCH " if self.optional else "MATCH "
path_degree = len(self.path_pattern.edge_pattern_list)
# add first node
node_pattern = self.path_pattern.node_pattern_list[0]
Expand Down Expand Up @@ -82,7 +84,7 @@ def to_string_cypher(self) -> str:
return match_string

def to_string_gql(self) -> str:
match_string = "MATCH "
match_string = "OPTIONAL MATCH " if self.optional else "MATCH "
path_degree = len(self.path_pattern.edge_pattern_list)
# add first node
node_pattern = self.path_pattern.node_pattern_list[0]
Expand Down
2 changes: 2 additions & 0 deletions app/core/clauses/return_clause.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class ReturnItem:
property: str
alias: str
function_name: str = ""
expression: str = ""


@dataclass
Expand All @@ -18,6 +19,7 @@ class SortItem:
property: str
order: str
function_name: str = ""
expression: str = ""


@dataclass
Expand Down
1 change: 1 addition & 0 deletions app/core/clauses/where_clause.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class CompareExpression:
property: tuple[str, Dict]
comparison_type: str
comparison_value: str
raw_expression: str = ""


class WhereClause(Clause):
Expand Down
24 changes: 22 additions & 2 deletions app/core/generalizer/query_generalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,29 @@
from app.core.clauses.where_clause import CompareExpression, WhereClause
from app.core.schema.schema_graph import SchemaGraph
from app.core.schema.schema_parser import SchemaParser
from app.impl.oracle_sqlpgq.schema.schema_parser import OracleSqlPgqSchemaParser
from app.impl.tugraph_cypher.schema.schema_parser import TuGraphSchemaParser


class QueryGeneralizer:
def __init__(self, db_id, instance_path):
SCHEMA_PARSERS = {
"tugraph_cypher": TuGraphSchemaParser,
"tugraph": TuGraphSchemaParser,
"oracle_sqlpgq": OracleSqlPgqSchemaParser,
"oracle": OracleSqlPgqSchemaParser,
}

def __init__(self, db_id, instance_path, backend: str = "tugraph_cypher"):
self.db_id = db_id
self.instance_path = instance_path
self.schema_parser: SchemaParser = TuGraphSchemaParser(db_id, instance_path)
self.backend = backend
parser_class = self.SCHEMA_PARSERS.get(backend)
if parser_class is None:
supported = ", ".join(sorted(self.SCHEMA_PARSERS))
raise ValueError(
f"Unsupported schema backend '{backend}'. Supported backends: {supported}"
)
self.schema_parser: SchemaParser = parser_class(db_id, instance_path)
self.schema_graph: SchemaGraph = self.schema_parser.get_schema_graph()

def generalize(self, query_pattern: List[Clause]) -> List[str]:
Expand Down Expand Up @@ -54,6 +69,11 @@ def generalize_from_llm(self, query_template: str) -> List[str]:

def generalize_from_cypher(self, query_template: str) -> List[str]:
# TODO: use original awesome-text2gql to generalize new query.
if self.backend not in {"tugraph_cypher", "tugraph"}:
raise NotImplementedError(
"generalize_from_cypher is backed by the TuGraph Cypher generalizer. "
"Use get_query_pattern + generalize + an Oracle translator for oracle_sqlpgq."
)
from app.impl.tugraph_cypher.generalizer.graph_query_generalizer import (
GraphQueryGeneralizer as CypherGeneralizer,
)
Expand Down
57 changes: 47 additions & 10 deletions app/core/generator/corpus_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,46 @@


class CorpusGenerator:
def __init__(self, llm_client: LlmClient):
def __init__(
self,
llm_client: LlmClient,
query_language: str = "cypher",
graph_name: str | None = None,
):
self.llm_client = llm_client
self.query_language = query_language.lower()
self.graph_name = graph_name

def _uses_sqlpgq(self) -> bool:
    """Return True when ``self.query_language`` names Oracle SQL/PGQ.

    This is the single place that lists the accepted SQL/PGQ spellings, so
    every prompt selector below stays in agreement when an alias is added
    or renamed.
    """
    return self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}

def _system_prompt(self) -> str:
    """Return the system prompt for the configured query language.

    Returns:
        ``corpus.SQLPGQ_SYSTEM_PROMPT`` for SQL/PGQ, otherwise
        ``corpus.SYSTEM_PROMPT``.
    """
    if self._uses_sqlpgq():
        return corpus.SQLPGQ_SYSTEM_PROMPT
    return corpus.SYSTEM_PROMPT

def _instruction_template(self) -> str:
    """Return the instruction template for the configured query language.

    Returns:
        ``corpus.SQLPGQ_INSTRUCTION_TEMPLATE`` for SQL/PGQ, otherwise
        ``corpus.INSTRUCTION_TEMPLATE``.
    """
    if self._uses_sqlpgq():
        return corpus.SQLPGQ_INSTRUCTION_TEMPLATE
    return corpus.INSTRUCTION_TEMPLATE

def _translation_prompt_template(self) -> str:
    """Return the translation prompt template for the configured language.

    Returns:
        ``corpus.SQLPGQ_TRANSLATION_PROMPT_TEMPLATE`` for SQL/PGQ, otherwise
        ``corpus.TRANSLATION_PROMPT_TEMPLATE``.
    """
    if self._uses_sqlpgq():
        return corpus.SQLPGQ_TRANSLATION_PROMPT_TEMPLATE
    return corpus.TRANSLATION_PROMPT_TEMPLATE

def _query_template_instruction(self) -> str:
    """Return the query-template instruction for the configured language.

    Returns:
        ``corpus.SQLPGQ_QUERY_TEMPLATE_INSTRUCTION`` for SQL/PGQ, otherwise
        ``corpus.QUERY_TEMPLATE_INSTRUCTION``.
    """
    if self._uses_sqlpgq():
        return corpus.SQLPGQ_QUERY_TEMPLATE_INSTRUCTION
    return corpus.QUERY_TEMPLATE_INSTRUCTION

def _query_archetypes(self) -> List[str]:
    """Return the query archetypes for the configured query language.

    Returns:
        ``corpus.SQLPGQ_QUERY_ARCHETYPES`` for SQL/PGQ, otherwise
        ``corpus.QUERY_ARCHETYPES``.
    """
    if self._uses_sqlpgq():
        return corpus.SQLPGQ_QUERY_ARCHETYPES
    return corpus.QUERY_ARCHETYPES

def _extract_json_from_response(self, response: str, expect_list: bool = True):
"""Extract JSON from LLM response."""
if not response:
print(" [Warning] Empty LLM response.")
return [] if expect_list else {}
try:
start_char, end_char = ("[", "]") if expect_list else ("{", "}")
json_start = response.find(start_char)
Expand Down Expand Up @@ -40,7 +75,7 @@ def generate_questions_batch(
all_questions = set()

# Randomly select a query intent archetype to guide generation
archetype = random.choice(corpus.QUERY_ARCHETYPES)
archetype = random.choice(self._query_archetypes())
print(f"Brainstorming questions with intent: '{archetype.split(':')[0]}'")

instruction = corpus.EXPLORATION_PROMPT_TEMPLATE.format(
Expand All @@ -50,7 +85,7 @@ def generate_questions_batch(
num_to_generate=questions_per_call,
)
message = [
{"role": "system", "content": corpus.SYSTEM_PROMPT},
{"role": "system", "content": self._system_prompt()},
{"role": "user", "content": instruction},
]

Expand All @@ -69,16 +104,17 @@ def generate_translation_batch(
self, schema_json: str, questions: List[str], error_context: Dict[str, str] = None
) -> List[Dict[str, Any]]:
"""
Translate a list of questions into Cypher queries.
Translate a list of questions into the configured graph query language.
Supports retries by providing an error_context.
"""
instruction = corpus.TRANSLATION_PROMPT_TEMPLATE.format(
instruction = self._translation_prompt_template().format(
schema_json=schema_json,
question=questions[0], # Assuming one question per call for clarity
graph_name=self.graph_name or "GRAPH_NAME",
error_context=error_context if error_context else "",
)
message = [
{"role": "system", "content": corpus.SYSTEM_PROMPT},
{"role": "system", "content": self._system_prompt()},
{"role": "user", "content": instruction},
]

Expand Down Expand Up @@ -193,13 +229,14 @@ def run_generation_loop(
selected_contexts = random_examples

# 1. Build Prompt
instruction = corpus.INSTRUCTION_TEMPLATE.format(
instruction = self._instruction_template().format(
schema_json=schema_json,
examples_json=json.dumps(selected_contexts, indent=2, ensure_ascii=False),
num_per_iteration=num_per_iteration,
graph_name=self.graph_name or "GRAPH_NAME",
)
message = [
{"role": "system", "content": corpus.SYSTEM_PROMPT},
{"role": "system", "content": self._system_prompt()},
{"role": "user", "content": instruction},
]

Expand Down Expand Up @@ -274,7 +311,7 @@ def generate_template_based_corpus(
# 3. Construct the Prompt
# We directly provide the "raw" data and ask the LLM to do three things:
# extract information, fill the template, and generate questions.
instraction = corpus.QUERY_TEMPLATE_INSTRUCTION.format(
instraction = self._query_template_instruction().format(
raw_data_str=raw_data_str,
current_batch_size=current_batch_size,
selected_templates=selected_templates,
Expand All @@ -283,7 +320,7 @@ def generate_template_based_corpus(
message = [
{
"role": "system",
"content": "You are a helpful assistant that generates Cypher datasets.",
"content": self._system_prompt(),
},
{"role": "user", "content": instraction},
]
Expand Down
Loading