From 01491dfd44e6bda0a3dfdb54ee31cb53c75e1f6a Mon Sep 17 00:00:00 2001 From: ayoubmoussaid Date: Mon, 11 May 2026 10:08:35 +0100 Subject: [PATCH 01/11] feat: Add OracleDB support and related tests - Updated pyproject.toml to include oracledb dependency. - Introduced new test suite for translating Cypher queries to Oracle SQL/PGQ. - Implemented dataset preparation tests for Oracle integration. - Added live tests for OracleDB client functionality. - Created query generalizer and template instantiator for Oracle SQL/PGQ. - Enhanced corpus combiner to handle Oracle-specific queries and validation. - Included schema parser for generating Oracle DDL statements. --- .gitignore | 8 +- README.md | 91 ++ app/core/clauses/match_clause.py | 1 + app/core/clauses/return_clause.py | 2 + app/core/clauses/where_clause.py | 1 + app/core/generalizer/query_generalizer.py | 24 +- app/core/generator/corpus_generator.py | 57 +- app/core/llm/llm_client.py | 110 +- app/core/prompt/corpus.py | 224 +++- app/core/validator/validator.py | 77 +- app/impl/oracle_sqlpgq/__init__.py | 2 + .../oracle_sqlpgq/ast_visitor/__init__.py | 2 + .../ast_visitor/oracle_sqlpgq_ast_visitor.py | 293 +++++ app/impl/oracle_sqlpgq/db_client/__init__.py | 2 + .../db_client/oracle_db_client.py | 114 ++ app/impl/oracle_sqlpgq/generator/__init__.py | 2 + .../generator/corpus_combiner.py | 174 +++ .../generator/query_generalizer.py | 155 +++ .../generator/template_instantiator.py | 343 ++++++ app/impl/oracle_sqlpgq/schema/__init__.py | 2 + .../oracle_sqlpgq/schema/schema_parser.py | 447 +++++++ app/impl/oracle_sqlpgq/translator/__init__.py | 2 + .../oracle_sqlpgq_query_translator.py | 1092 +++++++++++++++++ app/impl/oracle_sqlpgq/utils/__init__.py | 2 + app/impl/oracle_sqlpgq/utils/sqlpgq.py | 181 +++ .../ast_visitor/tugraph_cypher_ast_visitor.py | 175 ++- dataset_prep/__init__.py | 2 + dataset_prep/analyze_failures.py | 230 ++++ dataset_prep/discover.py | 85 ++ dataset_prep/oracle_loader.py | 225 ++++ dataset_prep/preflight.py | 66 + dataset_prep/reporting.py | 56 + dataset_prep/translate_validate.py | 287 +++++ .../oracle_sqlpgq_data_generation_workflow.md | 555 +++++++++ examples/combine_oracle_sqlpgq_corpus.py | 69 ++ examples/cypher2oracle_sqlpgq.py | 342 ++++++ .../generalize_oracle_sqlpgq_from_cypher.py | 60 + examples/generate_corpus_oracle_sqlpgq.py | 285 +++++ .../generate_corpus_oracle_sqlpgq_fraud.py | 227 ++++ .../generate_oracle_sqlpgq_template_corpus.py | 48 + examples/setup_oracle_sqlpgq_example_db.py | 544 ++++++++ examples/setup_oracle_sqlpgq_fraud_db.py | 494 ++++++++ examples/tugraph_to_oracle_sqlpgq.py | 51 + pyproject.toml | 6 +- test/test_cypher2oracle_sqlpgq.py | 442 +++++++ test/test_dataset_prep.py | 186 +++ test/test_oracle_sqlpgq.py | 153 +++ test/test_oracle_sqlpgq_corpus_combiner.py | 79 ++ test/test_oracle_sqlpgq_live.py | 31 + test/test_oracle_sqlpgq_query_generalizer.py | 79 ++ ...est_oracle_sqlpgq_template_instantiator.py | 78 ++ 51 files changed, 8201 insertions(+), 62 deletions(-) create mode 100644 app/impl/oracle_sqlpgq/__init__.py create mode 100644 app/impl/oracle_sqlpgq/ast_visitor/__init__.py create mode 100644 app/impl/oracle_sqlpgq/ast_visitor/oracle_sqlpgq_ast_visitor.py create mode 100644 app/impl/oracle_sqlpgq/db_client/__init__.py create mode 100644 app/impl/oracle_sqlpgq/db_client/oracle_db_client.py create mode 100644 app/impl/oracle_sqlpgq/generator/__init__.py create mode 100644 app/impl/oracle_sqlpgq/generator/corpus_combiner.py create mode 100644 app/impl/oracle_sqlpgq/generator/query_generalizer.py create mode 100644 app/impl/oracle_sqlpgq/generator/template_instantiator.py create mode 100644 app/impl/oracle_sqlpgq/schema/__init__.py create mode 100644 app/impl/oracle_sqlpgq/schema/schema_parser.py create mode 100644 app/impl/oracle_sqlpgq/translator/__init__.py create mode 100644 app/impl/oracle_sqlpgq/translator/oracle_sqlpgq_query_translator.py create mode 100644 app/impl/oracle_sqlpgq/utils/__init__.py create mode 100644 app/impl/oracle_sqlpgq/utils/sqlpgq.py create mode 100644 dataset_prep/__init__.py create mode 100644 dataset_prep/analyze_failures.py create mode 100644 dataset_prep/discover.py create mode 100644 dataset_prep/oracle_loader.py create mode 100644 dataset_prep/preflight.py create mode 100644 dataset_prep/reporting.py create mode 100644 dataset_prep/translate_validate.py create mode 100644 doc/en-us/development/oracle_sqlpgq_data_generation_workflow.md create mode 100644 examples/combine_oracle_sqlpgq_corpus.py create mode 100644 examples/cypher2oracle_sqlpgq.py create mode 100644 examples/generalize_oracle_sqlpgq_from_cypher.py create mode 100644 examples/generate_corpus_oracle_sqlpgq.py create mode 100644 examples/generate_corpus_oracle_sqlpgq_fraud.py create mode 100644 examples/generate_oracle_sqlpgq_template_corpus.py create mode 100644 examples/setup_oracle_sqlpgq_example_db.py create mode 100644 examples/setup_oracle_sqlpgq_fraud_db.py create mode 100644 examples/tugraph_to_oracle_sqlpgq.py create mode 100644 test/test_cypher2oracle_sqlpgq.py create mode 100644 test/test_dataset_prep.py create mode 100644 test/test_oracle_sqlpgq.py create mode 100644 test/test_oracle_sqlpgq_corpus_combiner.py create mode 100644 test/test_oracle_sqlpgq_live.py create mode 100644 test/test_oracle_sqlpgq_query_generalizer.py create mode 100644 test/test_oracle_sqlpgq_template_instantiator.py diff --git a/.gitignore b/.gitignore index bec1719..664b196 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,12 @@ # specific output/ corpus/ +.history/ +dataset/ +examples/Oracle_SQLPGQ_Instance/ +examples/generated_corpus/oracle_sqlpgq_*.json +examples/generated_corpus/cypher_to_oracle_sqlpgq*.json +test_oracle_sqlpgq_query.json # Byte-compiled / optimized / DLL files __pycache__/ @@ -168,4 +174,4 @@ cython_debug/ #.idea/ # poetry -poetry.lock \ No newline at end of file +poetry.lock diff --git a/README.md b/README.md index 01b6d47..b5e923b 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,97 @@ After all, run: When the script finishes, the generated corpus will be saved to examples/generated_corpus/{graph_name}_template_corpus.json. +#### Convert Schema to Oracle SQL Property Graphs (SQL/PGQ) + +`python ./examples/tugraph_to_oracle_sqlpgq.py` + +This example converts a framework/TuGraph-style schema JSON into Oracle SQL/PGQ artifacts: + +- relational vertex and edge table DDL +- `CREATE OR REPLACE PROPERTY GRAPH` DDL +- Oracle artifact manifest JSON +- `python-oracledb` CSV loader template + +Oracle SQL/PGQ query generation and validation are available through: + +```python +from app.core.generator.corpus_generator import CorpusGenerator +from app.core.llm.llm_client import LlmClient +from app.core.validator.validator import CorpusValidator +from app.impl.oracle_sqlpgq.translator.oracle_sqlpgq_query_translator import ( + OracleSqlPgqQueryTranslator, +) + +llm_client = LlmClient(model="qwen3-coder-plus-2025-07-22") +translator = OracleSqlPgqQueryTranslator(graph_name="TEXT2GQL_GRAPH") +validator = CorpusValidator(backend="oracle_sqlpgq", db_client_params={ + "dsn": "localhost:1521/FREEPDB1", + "user": "graph_user", + "password": "password", +}) +generator = CorpusGenerator(llm_client, query_language="oracle_sqlpgq") +``` + +To create and populate the example Oracle SQL/PGQ graph: + +```bash +export ORACLE_DSN='localhost:1521/FREEPDB1' +export ORACLE_USER='graph_user' +export ORACLE_PASSWORD='password' +python3 examples/setup_oracle_sqlpgq_example_db.py +``` + +To generate and validate an Oracle SQL/PGQ corpus with the configured LLM: + +```bash +export LLM_PLATFORM='dashscope' # or openai +export DASHSCOPE_API_KEY='...' +python3 examples/generate_corpus_oracle_sqlpgq.py +``` + +#### Translate Cypher to Oracle SQL/PGQ + +`examples/cypher2oracle_sqlpgq.py` translates supported Cypher queries into Oracle SQL/PGQ `GRAPH_TABLE` queries by reusing the framework's intermediate representation: + +```text +Cypher query -> Cypher grammar check -> Graph-IL -> Oracle SQL/PGQ translator -> GRAPH_TABLE query +``` + +For one query: + +```bash +poetry run python examples/cypher2oracle_sqlpgq.py \ + --query "MATCH (p:PERSON)-[a:ACTED_IN]->(m:MOVIE) RETURN m.title AS movie_title" \ + --graph-name TEXT2GQL_GRAPH +``` + +For a large query pool, provide a JSON file containing a plain list of Cypher query strings: + +```json +[ + "MATCH (p:PERSON) RETURN p.name AS person_name", + "MATCH (p:PERSON)-[a:ACTED_IN]->(m:MOVIE) WHERE p.name = 'Tom Hanks' RETURN m.title AS movie_title" +] +``` + +Then run: + +```bash +poetry run python examples/cypher2oracle_sqlpgq.py \ + --input examples/cypher_queries.json \ + --output examples/generated_corpus/cypher_to_oracle_sqlpgq.json \ + --graph-name TEXT2GQL_GRAPH +``` + +The output contains the source query, translated Oracle SQL/PGQ query, and a category: + +- `Graph-IL Translatable`: the query was translated successfully. +- `Not Comply with OpenCypher`: the source query failed the current Cypher grammar check. +- `Graph-IL Not Support`: the query parsed as Cypher but is outside the current Graph-IL visitor subset. +- `No Related Oracle SQL/PGQ Standard`: translation completed but failed SQL/PGQ validation. + +The current Graph-IL subset covers basic `MATCH` paths, simple property comparisons, `RETURN` items, aliases, aggregates, `ORDER BY`, `SKIP`, `LIMIT`, `DISTINCT`, and relationship hop ranges. More advanced Cypher features require extending the IR and visitor support first. + #### Cypher2GQL `python ./examples/cypher2gql.py` diff --git a/app/core/clauses/match_clause.py b/app/core/clauses/match_clause.py index 6ea30fb..fe9c91f 100644 --- a/app/core/clauses/match_clause.py +++ b/app/core/clauses/match_clause.py @@ -24,6 +24,7 @@ class EdgePattern: class PathPattern: node_pattern_list: List[NodePattern] edge_pattern_list: List[EdgePattern] + path_variable: str = "" class MatchClause(Clause): diff --git a/app/core/clauses/return_clause.py b/app/core/clauses/return_clause.py index 5a77211..634231c 100644 --- a/app/core/clauses/return_clause.py +++ b/app/core/clauses/return_clause.py @@ -10,6 +10,7 @@ class ReturnItem: property: str alias: str function_name: str = "" + expression: str = "" @dataclass @@ -18,6 +19,7 @@ class SortItem: property: str order: str function_name: str = "" + expression: str = "" @dataclass diff --git a/app/core/clauses/where_clause.py b/app/core/clauses/where_clause.py index db01918..84ee1cf 100644 --- a/app/core/clauses/where_clause.py +++ b/app/core/clauses/where_clause.py @@ -10,6 +10,7 @@ class CompareExpression: property: tuple[str, Dict] comparison_type: str comparison_value: str + raw_expression: str = "" class WhereClause(Clause): diff --git a/app/core/generalizer/query_generalizer.py b/app/core/generalizer/query_generalizer.py index 23581cd..77b372a 100644 --- a/app/core/generalizer/query_generalizer.py +++ b/app/core/generalizer/query_generalizer.py @@ -6,14 +6,29 @@ from app.core.clauses.where_clause import CompareExpression, WhereClause from app.core.schema.schema_graph import SchemaGraph from app.core.schema.schema_parser import SchemaParser +from app.impl.oracle_sqlpgq.schema.schema_parser import OracleSqlPgqSchemaParser from app.impl.tugraph_cypher.schema.schema_parser import TuGraphSchemaParser class QueryGeneralizer: - def __init__(self, db_id, instance_path): + SCHEMA_PARSERS = { + "tugraph_cypher": TuGraphSchemaParser, + "tugraph": TuGraphSchemaParser, + "oracle_sqlpgq": OracleSqlPgqSchemaParser, + "oracle": OracleSqlPgqSchemaParser, + } + + def __init__(self, db_id, instance_path, backend: str = "tugraph_cypher"): self.db_id = db_id self.instance_path = instance_path - self.schema_parser: SchemaParser = TuGraphSchemaParser(db_id, instance_path) + self.backend = backend + parser_class = self.SCHEMA_PARSERS.get(backend) + if parser_class is None: + supported = ", ".join(sorted(self.SCHEMA_PARSERS)) + raise ValueError( + f"Unsupported schema backend '{backend}'. Supported backends: {supported}" + ) + self.schema_parser: SchemaParser = parser_class(db_id, instance_path) self.schema_graph: SchemaGraph = self.schema_parser.get_schema_graph() def generalize(self, query_pattern: List[Clause]) -> List[str]: @@ -54,6 +69,11 @@ def generalize_from_llm(self, query_template: str) -> List[str]: def generalize_from_cypher(self, query_template: str) -> List[str]: # TODO: use original awesome-text2gql to generalize new query. + if self.backend not in {"tugraph_cypher", "tugraph"}: + raise NotImplementedError( + "generalize_from_cypher is backed by the TuGraph Cypher generalizer. " + "Use get_query_pattern + generalize + an Oracle translator for oracle_sqlpgq." + ) from app.impl.tugraph_cypher.generalizer.graph_query_generalizer import ( GraphQueryGeneralizer as CypherGeneralizer, ) diff --git a/app/core/generator/corpus_generator.py b/app/core/generator/corpus_generator.py index 6ca45be..f69ea60 100644 --- a/app/core/generator/corpus_generator.py +++ b/app/core/generator/corpus_generator.py @@ -8,11 +8,46 @@ class CorpusGenerator: - def __init__(self, llm_client: LlmClient): + def __init__( + self, + llm_client: LlmClient, + query_language: str = "cypher", + graph_name: str | None = None, + ): self.llm_client = llm_client + self.query_language = query_language.lower() + self.graph_name = graph_name + + def _system_prompt(self) -> str: + if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}: + return corpus.SQLPGQ_SYSTEM_PROMPT + return corpus.SYSTEM_PROMPT + + def _instruction_template(self) -> str: + if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}: + return corpus.SQLPGQ_INSTRUCTION_TEMPLATE + return corpus.INSTRUCTION_TEMPLATE + + def _translation_prompt_template(self) -> str: + if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}: + return corpus.SQLPGQ_TRANSLATION_PROMPT_TEMPLATE + return corpus.TRANSLATION_PROMPT_TEMPLATE + + def _query_template_instruction(self) -> str: + if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}: + return corpus.SQLPGQ_QUERY_TEMPLATE_INSTRUCTION + return corpus.QUERY_TEMPLATE_INSTRUCTION + + def _query_archetypes(self) -> List[str]: + if self.query_language in {"oracle_sqlpgq", "sqlpgq", "sql/pgq"}: + return corpus.SQLPGQ_QUERY_ARCHETYPES + return corpus.QUERY_ARCHETYPES def _extract_json_from_response(self, response: str, expect_list: bool = True): """Extract JSON from LLM response.""" + if not response: + print(" [Warning] Empty LLM response.") + return [] if expect_list else {} try: start_char, end_char = ("[", "]") if expect_list else ("{", "}") json_start = response.find(start_char) @@ -40,7 +75,7 @@ def generate_questions_batch( all_questions = set() # Randomly select a query intent archetype to guide generation - archetype = random.choice(corpus.QUERY_ARCHETYPES) + archetype = random.choice(self._query_archetypes()) print(f"Brainstorming questions with intent: '{archetype.split(':')[0]}'") instruction = corpus.EXPLORATION_PROMPT_TEMPLATE.format( @@ -50,7 +85,7 @@ def generate_questions_batch( num_to_generate=questions_per_call, ) message = [ - {"role": "system", "content": corpus.SYSTEM_PROMPT}, + {"role": "system", "content": self._system_prompt()}, {"role": "user", "content": instruction}, ] @@ -69,16 +104,17 @@ def generate_translation_batch( self, schema_json: str, questions: List[str], error_context: Dict[str, str] = None ) -> List[Dict[str, Any]]: """ - Translate a list of questions into Cypher queries. + Translate a list of questions into the configured graph query language. Supports retries by providing an error_context. """ - instruction = corpus.TRANSLATION_PROMPT_TEMPLATE.format( + instruction = self._translation_prompt_template().format( schema_json=schema_json, question=questions[0], # Assuming one question per call for clarity + graph_name=self.graph_name or "GRAPH_NAME", error_context=error_context if error_context else "", ) message = [ - {"role": "system", "content": corpus.SYSTEM_PROMPT}, + {"role": "system", "content": self._system_prompt()}, {"role": "user", "content": instruction}, ] @@ -193,13 +229,14 @@ def run_generation_loop( selected_contexts = random_examples # 1. Build Prompt - instruction = corpus.INSTRUCTION_TEMPLATE.format( + instruction = self._instruction_template().format( schema_json=schema_json, examples_json=json.dumps(selected_contexts, indent=2, ensure_ascii=False), num_per_iteration=num_per_iteration, + graph_name=self.graph_name or "GRAPH_NAME", ) message = [ - {"role": "system", "content": corpus.SYSTEM_PROMPT}, + {"role": "system", "content": self._system_prompt()}, {"role": "user", "content": instruction}, ] @@ -274,7 +311,7 @@ def generate_template_based_corpus( # 3. Construct the Prompt # We directly provide the "raw" data and ask the LLM to do three things: # extract information, fill the template, and generate questions. - instraction = corpus.QUERY_TEMPLATE_INSTRUCTION.format( + instraction = self._query_template_instruction().format( raw_data_str=raw_data_str, current_batch_size=current_batch_size, selected_templates=selected_templates, @@ -283,7 +320,7 @@ def generate_template_based_corpus( message = [ { "role": "system", - "content": "You are a helpful assistant that generates Cypher datasets.", + "content": self._system_prompt(), }, {"role": "user", "content": instraction}, ] diff --git a/app/core/llm/llm_client.py b/app/core/llm/llm_client.py index f23528e..da1e679 100644 --- a/app/core/llm/llm_client.py +++ b/app/core/llm/llm_client.py @@ -1,4 +1,5 @@ from http import HTTPStatus +import json import os import random import time @@ -77,20 +78,117 @@ def call_with_messages_local(self, messages): def call_with_messages_online_for_openai(self, messages): try: - openai_client = OpenAI( - api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL") - ) + client_kwargs = { + "api_key": os.getenv("OPENAI_API_KEY"), + "base_url": os.getenv("OPENAI_BASE_URL"), + } + default_headers = self._openai_default_headers() + if default_headers: + client_kwargs["default_headers"] = default_headers + + openai_client = OpenAI(**client_kwargs) response = openai_client.chat.completions.create( model=self.model, messages=messages, temperature=0 ) - return response.choices[0].message.content + return self._extract_openai_chat_content(response) except openai.RateLimitError: print("there are too many request,ready to retry in 1 second") time.sleep(1) print("begin to retry") return self.call_with_messages_online_for_openai(messages) - except OpenAIError: - print("Failed!", messages[1]["content"]) + except OpenAIError as exc: + print( + "OpenAI-compatible LLM call failed: " + f"{type(exc).__name__}: {exc}" + ) + return "" + + def _extract_openai_chat_content(self, response) -> str: + if isinstance(response, str): + stream_content = self._extract_openai_sse_content(response) + if stream_content: + return stream_content + try: + response = json.loads(response) + except json.JSONDecodeError: + preview = response[:500].replace("\n", "\\n") + print( + "OpenAI-compatible LLM returned a raw string instead of a " + f"chat completion object: {preview}" + ) + return response + + if isinstance(response, dict): + choices = response.get("choices") or [] + if choices: + message = choices[0].get("message") or {} + content = message.get("content") + if content is not None: + return content + preview = json.dumps(response, default=str)[:500] + print(f"OpenAI-compatible LLM returned unexpected JSON: {preview}") + return "" + + try: + return response.choices[0].message.content or "" + except (AttributeError, IndexError, KeyError, TypeError) as exc: + preview = repr(response)[:500] + print( + "OpenAI-compatible LLM returned an unexpected response type " + f"({type(response).__name__}): {preview}; parse error: {exc}" + ) + return "" + + def _extract_openai_sse_content(self, response_text: str) -> str: + if not response_text.lstrip().startswith("data:"): + return "" + + content_parts: list[str] = [] + for line in response_text.splitlines(): + line = line.strip() + if not line.startswith("data:"): + continue + + payload = line.removeprefix("data:").strip() + if not payload or payload == "[DONE]": + continue + + try: + event = json.loads(payload) + except json.JSONDecodeError: + continue + + for choice in event.get("choices", []): + delta = choice.get("delta") or {} + if delta.get("content"): + content_parts.append(delta["content"]) + + message = choice.get("message") or {} + if message.get("content"): + content_parts.append(message["content"]) + + if choice.get("text"): + content_parts.append(choice["text"]) + + if not content_parts: + preview = response_text[:500].replace("\n", "\\n") + print(f"OpenAI-compatible LLM returned an empty SSE stream: {preview}") + return "" + return "".join(content_parts) + + def _openai_default_headers(self) -> dict[str, str]: + raw_headers = os.getenv("OPENAI_EXTRA_HEADERS") or os.getenv("OPENAI_HTTP_HEADERS") + if not raw_headers: + return {} + try: + parsed = json.loads(raw_headers) + except json.JSONDecodeError as exc: + print(f"Invalid OPENAI_EXTRA_HEADERS JSON: {exc}") + return {} + if not isinstance(parsed, dict): + print("OPENAI_EXTRA_HEADERS must be a JSON object.") + return {} + return {str(key): str(value) for key, value in parsed.items()} def call_with_messages_online_for_dashscope(self, messages): response = Generation.call( diff --git a/app/core/prompt/corpus.py b/app/core/prompt/corpus.py index dbc0919..a13b069 100644 --- a/app/core/prompt/corpus.py +++ b/app/core/prompt/corpus.py @@ -8,6 +8,55 @@ Your output must be in strict JSON format, use English, as a list containing multiple objects. """ # noqa: E501 +SQLPGQ_SYSTEM_PROMPT = """ +You are an Oracle SQL/PGQ expert generating executable training data. +SQL/PGQ extends SQL with property-graph pattern matching through GRAPH_TABLE. + +Core rules: +- Use only the supplied graph name, labels, edge labels, and properties. Do not invent schema. +- Use property graph comments or validated examples as semantic guidance when present. +- Return exactly the JSON shape requested by the user prompt. Do not add explanations. +- Every graph access must use GRAPH_TABLE. The final SQL may be a direct GRAPH_TABLE query, + a CTE/subquery around GRAPH_TABLE, a join between GRAPH_TABLE and relational tables, or a + UNION ALL that combines graph results with normal SQL results. +- MATCH vertices with (v IS "LABEL") or (v); match edges with -[e IS "EDGE"]->, <-[e IS "EDGE"]-, or -[e IS "EDGE"]-. +- Declare variables for every vertex and edge that is referenced in WHERE, COLUMNS, SELECT, ORDER BY, GROUP BY, or HAVING. +- Never use Cypher or PGQL syntax: no (v:LABEL), no {{prop: value}}, no [:EDGE], no RETURN. +- Use double quotes for graph, label, and property identifiers; use single quotes only for string literals. +- Access properties inside GRAPH_TABLE as v."PROPERTY" or e."PROPERTY". +- Add graph WHERE predicates only when the question asks for a filter or the filter is required. Do not invent arbitrary literals. +- COLUMNS must not be empty. Alias every projected expression. Outside GRAPH_TABLE, refer to COLUMNS aliases, not graph variables. +- Project graph elements as VERTEX_ID(v) or EDGE_ID(e), not as raw vertex or edge variables. +- Put outer SQL operations such as ORDER BY, GROUP BY, HAVING, OFFSET, and FETCH outside GRAPH_TABLE. +- For aggregation, project needed values in COLUMNS and aggregate in outer SQL unless using path aggregates inside a quantified path. +- Use VERTEX_EQUAL(v1, v2) to compare vertices; do not compare vertices with v1 = v2. +- Avoid reserved words as variable names; use suffixes such as start_vertex or end_vertex. +- Keep parentheses balanced and avoid unsupported quantifiers such as *? or +?. + +Advanced SQL/PGQ patterns to generate when they fit the schema and examples: +- Mixed SQL plus SQL/PGQ: WITH clauses, inline views, joins to base tables, EXISTS/NOT EXISTS, + UNION ALL branches with compatible column counts and types, CASE expressions, analytic + functions such as ROW_NUMBER(), and outer GROUP BY/HAVING/ORDER BY/FETCH. +- Bounded recursive paths: use supported quantifiers such as *, +, ?, {n}, {n,}, and {n,m} + only where Oracle SQL/PGQ accepts them. Prefer bounded forms such as {1,3} for validation. +- ONE ROW PER MATCH, ONE ROW PER VERTEX (v), and ONE ROW PER STEP (src, edge, dst) when + path unnesting is requested. Iterator variables must be unique, must not appear in MATCH or + graph WHERE, and may be referenced only in COLUMNS. +- MATCHNUM(), PATH_NAME(), and ELEMENT_NUMBER(iterator) may be used in COLUMNS with + ONE ROW PER queries. +- LISTAGG, JSON_ARRAYAGG, MIN, MAX, AVG, SUM, and COUNT may be used in COLUMNS for path + aggregates when the variable is grouped by a quantified path; otherwise aggregate outside + GRAPH_TABLE. +- VERTEX_ID, EDGE_ID, VERTEX_EQUAL, EDGE_EQUAL, and IS SOURCE/DESTINATION OF predicates are + valid SQL/PGQ tools when the question asks for element identity or edge direction checks. + +Shortest-path caution: +- For Oracle database SQL property graph queries, do not generate PGQL-only path search goals + such as KEEP SHORTEST, ANY SHORTEST, ALL SHORTEST, ANY CHEAPEST, COST, TOTAL_COST, or + MATCH path_variable = (...) unless a validated example in the prompt uses that exact syntax. + Prefer a bounded path query with ORDER BY/FETCH or ONE ROW PER STEP for live DB validation. +""" # noqa: E501 + INSTRUCTION_TEMPLATE = """ # Command @@ -46,6 +95,78 @@ ] """ # noqa: E501 +SQLPGQ_INSTRUCTION_TEMPLATE = """ +# Command +Generate {num_per_iteration} new "question-query" data pairs based on the following information. + +# 1. Graph Schema +This is the Schema definition of the Oracle SQL property graph: +```json +{schema_json} +``` + +Graph name to use in every GRAPH_TABLE call: +```text +{graph_name} +``` + +2. Verified Query Examples (Context) +Here are some verified "question-query-result" examples that execute successfully. +```json +{examples_json} +``` + +3. Your Task +Generate {num_per_iteration} new, meaningful Oracle SQL/PGQ "question-query" data pairs. + +Basic direct GRAPH_TABLE shape: +```sql +SELECT * +FROM GRAPH_TABLE ( + "{graph_name}" + MATCH (n IS "LABEL")-[e IS "EDGE"]->(m IS "LABEL") + WHERE n."property" = 'value' + COLUMNS (m."property" AS property_alias) +) gt +``` + +Guidelines: +- Use "{graph_name}" as the graph name in every query. +- Use only schema labels, edge labels, and property names. +- Use concrete literal values from verified examples/results when filtering. +- If no known literal value is available, generate a broader query without that literal. +- Do not add a WHERE clause unless the question requires a filter. +- Always give every projected expression in COLUMNS an AS alias. +- Use COLUMNS aliases in outer SELECT, JOIN, ORDER BY, GROUP BY, HAVING, OFFSET, and FETCH clauses. +- For counts, averages, and grouping, project required values in COLUMNS and aggregate outside GRAPH_TABLE unless the query is a path aggregate. +- Project identifiers with VERTEX_ID(v) or EDGE_ID(e) when the question asks for vertices, edges, or IDs. +- Generate a diverse mix. Include at least one advanced SQL/PGQ pattern when the schema supports it: + CTEs around GRAPH_TABLE, joins to base tables, UNION ALL with normal SQL, analytic functions, + outer GROUP BY/HAVING, bounded path traversal, one-row-per-step path expansion, or element IDs. +- For UNION ALL, every branch must return the same number of columns with compatible data types. +- For joins to base tables, join using primary-key values projected from GRAPH_TABLE COLUMNS. +- For ONE ROW PER STEP, use unique iterator variables and project them only in COLUMNS; if unsure, + prefer a bounded multi-hop query that can be validated by Oracle. +- Do not generate KEEP SHORTEST, ANY SHORTEST, ALL SHORTEST, COST, or PGQL path-variable syntax + unless the verified examples include a working query using that exact form. +- Do not output Cypher, PGQL, placeholders, comments, explanations, or result fields. +- Return a strict JSON list of objects with "question" and "query" keys only. Do not include result fields. + +Valid example: +```json +[ + {{ + "question": "Which movies belong to the Science Fiction genre?", + "query": "SELECT * FROM GRAPH_TABLE (\"{graph_name}\" MATCH (m IS \"MOVIE\")-[b IS \"BELONGS_TO\"]->(g IS \"GENRE\") WHERE g.\"name\" = 'Science Fiction' COLUMNS (m.\"title\" AS movie_title, g.\"name\" AS genre_name)) gt" + }}, + {{ + "question": "Which graph-derived movies can also be checked against the base movie table?", + "query": "WITH graph_movies AS (SELECT gt.movie_id, gt.movie_title FROM GRAPH_TABLE (\"{graph_name}\" MATCH (m IS \"MOVIE\")-[b IS \"BELONGS_TO\"]->(g IS \"GENRE\") COLUMNS (m.\"MOVIE_id\" AS movie_id, m.\"title\" AS movie_title)) gt) SELECT gm.movie_title FROM graph_movies gm JOIN \"MOVIE\" m ON m.\"MOVIE_id\" = gm.movie_id" + }} +] +``` +""" # noqa: E501 + ENHANCEMENT_PROMPT_TEMPLATE = """ # Command Your task as a senior Cypher expert is to create more complex and insightful new "question-query" pairs based on existing queries. @@ -102,6 +223,17 @@ "Path Analysis and Traversal: Focus on analysis of paths themselves, such as finding the shortest path or all possible paths. Example: 'Find the shortest path between the type A node named [instance A] and the type B node named [instance B].'", # noqa: E501 ] +SQLPGQ_QUERY_ARCHETYPES = [ + "Direct Graph Pattern Query: Answer a question with one GRAPH_TABLE MATCH pattern, projected aliases, and optional graph WHERE filters.", # noqa: E501 + "Mixed SQL and SQL/PGQ Join: Use GRAPH_TABLE in a CTE or inline view, then join projected graph IDs or properties to normal relational tables.", # noqa: E501 + "UNION ALL Hybrid Query: Combine a normal SQL branch with a GRAPH_TABLE branch using compatible output columns and data types.", # noqa: E501 + "Outer SQL Aggregation: Project graph values in COLUMNS, then use COUNT, AVG, SUM, MIN, MAX, GROUP BY, HAVING, ORDER BY, or FETCH outside GRAPH_TABLE.", # noqa: E501 + "Analytic SQL Over Graph Results: Use ROW_NUMBER, RANK, DENSE_RANK, or partitioned aggregates over a GRAPH_TABLE result set.", # noqa: E501 + "Bounded Path Traversal: Use multi-hop or bounded quantified path patterns to ask reachability or chain questions while avoiding unsupported shortest-path goals.", # noqa: E501 + "One Row Per Path Expansion: Ask for path steps or path elements and generate ONE ROW PER STEP or ONE ROW PER VERTEX queries when supported by the validated examples.", # noqa: E501 + "Element Identity Query: Project VERTEX_ID or EDGE_ID, or compare graph elements with VERTEX_EQUAL/EDGE_EQUAL when identity matters.", # noqa: E501 +] + QUERY_TEMPLATE_INSTRUCTION = """ @@ -135,9 +267,38 @@ """ # noqa: E501 +SQLPGQ_QUERY_TEMPLATE_INSTRUCTION = """ + You are an Oracle SQL/PGQ query generator. + + I have run exploration queries on an Oracle SQL property graph and got the following RAW RESULT DATA. + + --- RAW DATA START --- + {raw_data_str} + --- RAW DATA END --- + + I also have a list of Oracle SQL/PGQ templates. + Generate {current_batch_size} new (Question, Query) pairs by filling these templates using REAL DATA extracted from the RAW DATA above. + + --- TEMPLATES --- + {selected_templates} + + --- CRITICAL RULES --- + 1. Use Oracle SQL/PGQ only: every graph access must use GRAPH_TABLE (... MATCH ... COLUMNS (...)). + 2. Use IS label syntax: (v IS "LABEL") and -[e IS "EDGE"]->. + 3. Never output Cypher syntax such as (v:LABEL), {{prop: value}}, [:EDGE], or RETURN. + 4. Use only labels, edge labels, properties, and literal values supported by the raw data/templates. + 5. Put graph filters in GRAPH_TABLE WHERE only when needed. + 6. Alias every COLUMNS expression and use those aliases in outer SQL. + 7. Mixed SQL is allowed: CTEs, joins with base tables, UNION ALL, analytic functions, + GROUP BY/HAVING, ORDER BY/FETCH, and bounded path patterns are valid when templates show them. + 8. Do not generate KEEP SHORTEST or PGQL path-variable syntax unless templates show a validated example. + 9. Output MUST be a strict JSON list of objects: [{{"question": "...", "query": "..."}}] + """ # noqa: E501 + + EXPLORATION_PROMPT_TEMPLATE = """ # Command -Your task is to brainstorm and generate diverse natural language questions. Focus on the breadth and depth of questions, without considering how to write Cypher queries for now. +Your task is to brainstorm and generate diverse natural language questions. Focus on the breadth and depth of questions, without writing the graph query yet. # 1. Graph Schema ```json @@ -197,3 +358,64 @@ "query": "MATCH (m:Movie) WHERE m.title = 'some movie' RETURN m" }} """ # noqa: E501 + +SQLPGQ_TRANSLATION_PROMPT_TEMPLATE = """ +Command +Your task as an Oracle SQL/PGQ expert is to accurately translate the given natural language question into an executable Oracle SQL property graph query. + +1. Graph Schema +```JSON +{schema_json} +``` + +Graph name to use in GRAPH_TABLE: +```text +{graph_name} +``` + +2. Question to be Translated +```json +{question} +``` + +3. Important Rules +- Use Oracle SQL/PGQ only, not Cypher and not PGQL. +- Every graph access must use GRAPH_TABLE ("{graph_name}" MATCH ... COLUMNS (...)). +- The final query may be direct GRAPH_TABLE SQL, or SQL wrapped with CTEs/subqueries, joins to + relational tables, UNION ALL, GROUP BY/HAVING, analytic functions, ORDER BY, OFFSET, or FETCH. +- Use labels, edge labels, and property names exactly as defined by the schema. +- Use "{graph_name}" as the graph name. +- Put graph-pattern predicates inside GRAPH_TABLE WHERE only when required by the question. +- Do not invent filter literals or placeholder predicates. +- Put projected values inside COLUMNS, and always give every projected value an AS alias. +- Use COLUMNS aliases in outer SELECT, JOIN, ORDER BY, GROUP BY, HAVING, OFFSET, and FETCH clauses. +- For aggregation, project the required value in COLUMNS and aggregate outside GRAPH_TABLE unless using a path aggregate. +- If the question asks for vertices or edges, project VERTEX_ID(v) or EDGE_ID(e). +- Never use Cypher forms such as (p:PERSON), {{NAME: 'Tom Hanks'}}, [:ACTED_IN], RETURN, or m.TITLE. +- Use Oracle SQL/PGQ forms such as (p IS "PERSON"), [a IS "ACTED_IN"], p."NAME", m."TITLE", and a."ROLE". +- COLUMNS must look like COLUMNS (m."TITLE" AS movie_title, a."ROLE" AS role), not COLUMNS (m."TITLE", a."ROLE"). +- For UNION ALL, each branch must return the same number of columns with compatible data types. +- For mixed SQL and SQL/PGQ joins, project graph IDs/properties in COLUMNS and join outside GRAPH_TABLE. +- For ONE ROW PER STEP, use unique iterator variables and reference them only in COLUMNS. +- Do not generate KEEP SHORTEST, ANY SHORTEST, ALL SHORTEST, COST, or PGQL path-variable syntax unless the error context or verified examples prove the target database accepts that exact form. +- Ensure every variable referenced in WHERE/COLUMNS is declared in MATCH. +- Ensure all parentheses are balanced. + +Correct Oracle SQL/PGQ example: +{{ +"query": "SELECT * FROM GRAPH_TABLE (\"{graph_name}\" MATCH (p IS \"PERSON\")-[a IS \"ACTED_IN\"]->(m IS \"MOVIE\") WHERE p.\"NAME\" = 'Tom Hanks' COLUMNS (m.\"TITLE\" AS movie_title, a.\"ROLE\" AS role)) gt" +}} + +Invalid Cypher-style example: +{{ +"query": "SELECT m.TITLE, e.ROLE FROM GRAPH_TABLE (MATCH (p:PERSON {{NAME: 'Tom Hanks'}})-[:ACTED_IN]->(m:MOVIE)) gt" +}} + +{error_context} + +4. Output Format +Return a JSON object containing only the "query" key. +{{ +"query": "SELECT ... FROM GRAPH_TABLE (...) gt" +}} +""" # noqa: E501 diff --git a/app/core/validator/validator.py b/app/core/validator/validator.py index db4e144..1108834 100644 --- a/app/core/validator/validator.py +++ b/app/core/validator/validator.py @@ -1,39 +1,92 @@ +from importlib import import_module import logging from typing import Any, Dict, List from app.core.validator.db_client import DB_Client, QueryResult, QueryStatus -from app.impl.tugraph_cypher.db_client.tugraph_db_client import TuGraphDBClient logger = logging.getLogger("CorpusValidator") class CorpusValidator: - def __init__(self, tu_client_params: dict): - # Store parameters instead of the client object itself + CLIENTS = { + "tugraph_cypher": ( + "app.impl.tugraph_cypher.db_client.tugraph_db_client", + "TuGraphDBClient", + ), + "tugraph": ( + "app.impl.tugraph_cypher.db_client.tugraph_db_client", + "TuGraphDBClient", + ), + "oracle_sqlpgq": ( + "app.impl.oracle_sqlpgq.db_client.oracle_db_client", + "OracleDBClient", + ), + "oracle": ( + "app.impl.oracle_sqlpgq.db_client.oracle_db_client", + "OracleDBClient", + ), + } + + def __init__( + self, + tu_client_params: dict | None = None, + backend: str = "tugraph_cypher", + db_client_params: dict | None = None, + db_client: DB_Client | None = None, + ): """ - Initialize validator and for instantiating TuGraph database client implementation. + Initialize validator and instantiate the selected database client implementation. + + Args: + tu_client_params: Backward-compatible TuGraph parameters. + backend: One of tugraph_cypher or oracle_sqlpgq. + db_client_params: Backend-specific connection parameters. + db_client: Optional prebuilt client, useful for tests. """ - self._tu_client_params = tu_client_params - self._client: DB_Client | None = None + self.backend = backend + self._client_params = db_client_params if db_client_params is not None else tu_client_params + self._client_params = self._client_params or {} + self._client: DB_Client | None = db_client - # Create connection during initialization and store the instance - self._client = TuGraphDBClient(self._tu_client_params) + if self._client is None: + client_class = self._resolve_client_class(backend) + self._client = client_class(self._client_params) - # Immediately check if connection was successful - if not self._client or not self._client.client: - logger.error("Database client failed to initialize or connect.") + if not self._is_client_ready(self._client): + logger.error( + f"Database client for backend '{backend}' failed to initialize or connect." + ) def _get_client(self) -> DB_Client | None: """Return the created client instance.""" return self._client + def _resolve_client_class(self, backend: str): + target = self.CLIENTS.get(backend) + if target is None: + supported = ", ".join(sorted(self.CLIENTS)) + raise ValueError( + f"Unsupported backend '{backend}'. Supported backends: {supported}" + ) + module_path, class_name = target + module = import_module(module_path) + return getattr(module, class_name) + + def _is_client_ready(self, client: DB_Client | None) -> bool: + if client is None: + return False + for attr in ("client", "connection", "driver"): + if hasattr(client, attr): + return getattr(client, attr) is not None + return True + def execute_with_results(self, pairs: List[Dict[str, str]]) -> List[Dict[str, Any]]: """ Validate all pairs and get query result, filter out pairs that fail execution or have empty results. """ client = self._get_client() - if not client or not client.client: + if not self._is_client_ready(client): logger.error("Database connection is not ready. Skipping validation.") raise Exception("Database connection is not ready.") diff --git a/app/impl/oracle_sqlpgq/__init__.py b/app/impl/oracle_sqlpgq/__init__.py new file mode 100644 index 0000000..e94768d --- /dev/null +++ b/app/impl/oracle_sqlpgq/__init__.py @@ -0,0 +1,2 @@ +"""Oracle SQL/PGQ backend support.""" + diff --git a/app/impl/oracle_sqlpgq/ast_visitor/__init__.py b/app/impl/oracle_sqlpgq/ast_visitor/__init__.py new file mode 100644 index 0000000..5770767 --- /dev/null +++ b/app/impl/oracle_sqlpgq/ast_visitor/__init__.py @@ -0,0 +1,2 @@ +"""Oracle SQL/PGQ AST visitor utilities.""" + diff --git a/app/impl/oracle_sqlpgq/ast_visitor/oracle_sqlpgq_ast_visitor.py b/app/impl/oracle_sqlpgq/ast_visitor/oracle_sqlpgq_ast_visitor.py new file mode 100644 index 0000000..ace2d04 --- /dev/null +++ b/app/impl/oracle_sqlpgq/ast_visitor/oracle_sqlpgq_ast_visitor.py @@ -0,0 +1,293 @@ +import re +from typing import List, Tuple + +from app.core.ast_visitor.ast_visitor import AstVisitor +from app.core.clauses.clause import Clause +from app.core.clauses.match_clause import EdgePattern, MatchClause, NodePattern, PathPattern +from app.core.clauses.return_clause import ReturnBody, ReturnClause, ReturnItem, SortItem +from app.core.clauses.where_clause import CompareExpression, WhereClause +from app.impl.oracle_sqlpgq.utils.sqlpgq import validate_graph_table_query + + +class OracleSqlPgqAstVisitor(AstVisitor): + """Parse the Oracle SQL/PGQ subset emitted by OracleSqlPgqQueryTranslator.""" + + NODE_RE = re.compile(r"\((?P[^()]*)\)", re.DOTALL) + EDGE_RE = re.compile( + r"(?P<)?-\[(?P[^\]]*)\]-(?P>)?(?P\{[^}]+\})?", + re.DOTALL, + ) + COMPARE_RE = re.compile( + r"(?P[A-Za-z_][A-Za-z0-9_]*)" + r"(?:\.(?:\"(?P[^\"]+)\"|(?P[A-Za-z_][A-Za-z0-9_]*)))?" + r"\s*(?P=|<>|<=|>=|<|>)\s*(?P.+)", + re.DOTALL, + ) + + def get_query_pattern(self, query: str) -> Tuple[bool, List[Clause]]: + if not validate_graph_table_query(query): + return False, [] + try: + body, graph_table_end = self._extract_graph_table_span(query) + match_text, where_text, columns_text = self._extract_sections(body) + clauses: List[Clause] = [] + for path_text in self._split_top_level(match_text): + clauses.append(MatchClause(self._parse_path(path_text))) + if where_text: + clauses.append(WhereClause(self._parse_where(where_text))) + return_items = self._parse_columns(columns_text) + if return_items: + sort_items, skip, limit = self._parse_outer_modifiers(query[graph_table_end:]) + clauses.append(ReturnClause(ReturnBody(return_items, sort_items, skip, limit))) + return True, clauses + except Exception: + return False, [] + + def _extract_graph_table_body(self, query: str) -> str: + return self._extract_graph_table_span(query)[0] + + def _extract_graph_table_span(self, query: str) -> Tuple[str, int]: + marker = re.search(r"GRAPH_TABLE\s*\(", query, re.IGNORECASE) + if marker is None: + raise ValueError("GRAPH_TABLE not found.") + start = marker.end() - 1 + depth = 0 + in_single = False + in_double = False + for index in range(start, len(query)): + char = query[index] + if char == "'" and not in_double: + in_single = not in_single + elif char == '"' and not in_single: + in_double = not in_double + elif not in_single and not in_double: + if char == "(": + depth += 1 + elif char == ")": + depth -= 1 + if depth == 0: + return query[start + 1 : index].strip(), index + 1 + raise ValueError("GRAPH_TABLE body is not balanced.") + + def _extract_sections(self, body: str) -> Tuple[str, str, str]: + upper = body.upper() + match_idx = upper.index("MATCH ") + columns_idx = upper.rindex("COLUMNS") + before_columns = body[match_idx + len("MATCH ") : columns_idx].strip() + where_idx = self._find_keyword(before_columns, "WHERE") + if where_idx == -1: + match_text = before_columns + where_text = "" + else: + match_text = before_columns[:where_idx].strip() + where_text = before_columns[where_idx + len("WHERE") :].strip() + columns_start = body.index("(", columns_idx) + columns_text = body[columns_start + 1 : body.rindex(")")].strip() + return match_text, where_text, columns_text + + def _find_keyword(self, text: str, keyword: str) -> int: + pattern = re.compile(rf"\b{keyword}\b", re.IGNORECASE) + for match in pattern.finditer(text): + prefix = text[: match.start()] + if prefix.count("(") == prefix.count(")") and prefix.count("[") == prefix.count("]"): + return match.start() + return -1 + + def _parse_path(self, text: str) -> PathPattern: + text = text.strip() + position = 0 + node_patterns: List[NodePattern] = [] + edge_patterns: List[EdgePattern] = [] + + first_node = self.NODE_RE.match(text, position) + if first_node is None: + raise ValueError("Path must start with a node pattern.") + node_patterns.append(self._parse_node(first_node.group("body"))) + position = first_node.end() + + while position < len(text): + edge_match = self.EDGE_RE.match(text, position) + if edge_match is None: + break + edge_patterns.append(self._parse_edge(edge_match)) + position = edge_match.end() + node_match = self.NODE_RE.match(text, position) + if node_match is None: + raise ValueError("Edge pattern must be followed by a node pattern.") + node_patterns.append(self._parse_node(node_match.group("body"))) + position = node_match.end() + + return PathPattern(node_patterns, edge_patterns) + + def _parse_node(self, body: str) -> NodePattern: + variable, label = self._parse_variable_and_label(body) + return NodePattern(variable, label, []) + + def _parse_edge(self, edge_match: re.Match) -> EdgePattern: + variable, label = self._parse_variable_and_label(edge_match.group("body")) + if edge_match.group("left"): + direction = "left" + elif edge_match.group("right"): + direction = "right" + else: + direction = "bidirection" + return EdgePattern(variable, label, [], direction, self._parse_hop(edge_match.group("hop"))) + + def _parse_variable_and_label(self, body: str) -> Tuple[str, str]: + body = body.strip() + if " WHERE " in body.upper(): + body = re.split(r"\bWHERE\b", body, flags=re.IGNORECASE)[0].strip() + label_pattern = ( + r"(?P[A-Za-z_][A-Za-z0-9_]*)" + r"(?:\s+IS\s+(?:\"(?P[^\"]+)\"|" + r"(?P