2 changes: 2 additions & 0 deletions hugegraph-llm/pyproject.toml
@@ -58,6 +58,7 @@ dependencies = [
"apscheduler",
"litellm",
"hugegraph-python-client",
"pycgraph",
]
[project.urls]
homepage = "https://hugegraph.apache.org/"
@@ -85,3 +86,4 @@ allow-direct-references = true

[tool.uv.sources]
hugegraph-python-client = { workspace = true }
pycgraph = { git = "https://github.com/ChunelFeng/CGraph.git", subdirectory = "python", rev = "main", marker = "sys_platform == 'linux'" }
‼️ Critical: Platform-specific dependency with insufficient error handling

The pycgraph dependency is marked with marker = "sys_platform == 'linux'", meaning it will only install on Linux systems. However, the code imports and uses PyCGraph unconditionally without any platform checks or error handling.

Impact: The application will crash immediately on non-Linux systems (macOS, Windows) when trying to import modules from hugegraph_llm.flows.

Recommendation:

  1. Add platform compatibility checks and graceful degradation
  2. Provide clear error messages for unsupported platforms
  3. Consider making CGraph support optional with a feature flag
  4. Document platform requirements in README/docs
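A minimal sketch of what recommendations 1 and 2 could look like — a guarded import plus a fail-fast helper. The `CGRAPH_AVAILABLE` flag and `require_cgraph` name are illustrative, not part of the PR:

```python
import sys

try:
    # Installed only on Linux per the sys_platform marker in pyproject.toml
    from PyCGraph import GPipeline
    CGRAPH_AVAILABLE = True
except ImportError:
    GPipeline = None
    CGRAPH_AVAILABLE = False


def require_cgraph() -> None:
    """Fail fast with a clear message on platforms without PyCGraph."""
    if not CGRAPH_AVAILABLE:
        raise RuntimeError(
            "CGraph-based workflows require Linux; PyCGraph is not "
            f"installed on platform {sys.platform!r}."
        )
```

Call sites in `hugegraph_llm.flows` could then invoke `require_cgraph()` before building pipelines instead of crashing at import time.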

16 changes: 16 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/flows/__init__.py
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
🧹 Minor: Empty __init__.py with only license header

This file contains only the Apache license header with no actual code. While functional, consider adding a brief module docstring to describe the flows package purpose.

Suggestion:

# ... (license header) ...

"""
Workflow orchestration module for HugeGraph AI.

This package provides flexible workflow scheduling and pipeline management
for various AI tasks including vector indexing and graph extraction.
"""

55 changes: 55 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/flows/build_vector_index.py
@@ -0,0 +1,55 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hugegraph_llm.flows.common import BaseFlow
from hugegraph_llm.state.ai_state import WkFlowInput

import json
from PyCGraph import GPipeline

from hugegraph_llm.operators.document_op.chunk_split import ChunkSplitNode
from hugegraph_llm.operators.index_op.build_vector_index import BuildVectorIndexNode
from hugegraph_llm.state.ai_state import WkFlowState


class BuildVectorIndexFlow(BaseFlow):
def __init__(self):
pass

def prepare(self, prepared_input: WkFlowInput, texts):
‼️ Critical: Hardcoded configuration breaks flexibility

The prepare method hardcodes language = "zh" (Chinese) and split_type = "paragraph". This contradicts the PR's goal of "flexible and extensible workflow scheduling."

Issues:

  1. Non-Chinese users cannot use this flow without code modification
  2. No way to customize split strategy for different document types
  3. Breaks the principle of configuration over convention

Recommendation:
Make these configurable parameters:

def prepare(self, prepared_input: WkFlowInput, texts, language="zh", split_type="paragraph"):
    prepared_input.texts = texts
    prepared_input.language = language
    prepared_input.split_type = split_type

prepared_input.texts = texts
prepared_input.language = "zh"
prepared_input.split_type = "paragraph"
return

def build_flow(self, texts):
pipeline = GPipeline()
# prepare for workflow input
prepared_input = WkFlowInput()
self.prepare(prepared_input, texts)

pipeline.createGParam(prepared_input, "wkflow_input")
pipeline.createGParam(WkFlowState(), "wkflow_state")

chunk_split_node = ChunkSplitNode()
build_vector_node = BuildVectorIndexNode()
pipeline.registerGElement(chunk_split_node, set(), "chunk_split")
pipeline.registerGElement(build_vector_node, {chunk_split_node}, "build_vector")

return pipeline

def post_deal(self, pipeline=None):
res = pipeline.getGParamWithNoEmpty("wkflow_state").to_json()
return json.dumps(res, ensure_ascii=False, indent=2)
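The two-node DAG registered above (`build_vector` depends on `chunk_split`) follows the usual dependency-set model: each element is registered with the set of elements it must run after. As an illustration only — not how PyCGraph schedules internally — the same idea with Python's stdlib topological sorter:

```python
from graphlib import TopologicalSorter


def execution_order(deps: dict[str, set[str]]) -> list[str]:
    """Return one valid run order for nodes given their dependency sets."""
    return list(TopologicalSorter(deps).static_order())
```

With `{"chunk_split": set(), "build_vector": {"chunk_split"}}`, `chunk_split` is always ordered before `build_vector`, mirroring the `registerGElement` calls.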
45 changes: 45 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/flows/common.py
@@ -0,0 +1,45 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod

from hugegraph_llm.state.ai_state import WkFlowInput


class BaseFlow(ABC):
"""
Base class for flows, defines three interface methods: prepare, build_flow, and post_deal.
"""

@abstractmethod
def prepare(self, prepared_input: WkFlowInput, *args, **kwargs):
🧹 Minor: Missing type hints

The BaseFlow abstract class and its implementations lack type hints for parameters and return values. This reduces code maintainability and IDE support.

Suggestion:

from typing import Any
from PyCGraph import GPipeline

@abstractmethod
def prepare(self, prepared_input: WkFlowInput, *args: Any, **kwargs: Any) -> None:
    """Pre-processing interface."""
    pass

@abstractmethod
def build_flow(self, *args: Any, **kwargs: Any) -> GPipeline:
    """Interface for building the flow."""
    pass

@abstractmethod
def post_deal(self, pipeline: GPipeline | None = None) -> str:
    """Post-processing interface."""
    pass

"""
Pre-processing interface.
"""
pass

@abstractmethod
def build_flow(self, *args, **kwargs):
"""
Interface for building the flow.
"""
pass

@abstractmethod
def post_deal(self, *args, **kwargs):
"""
Post-processing interface.
"""
pass
127 changes: 127 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/flows/graph_extract.py
@@ -0,0 +1,127 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from PyCGraph import GPipeline
from hugegraph_llm.flows.common import BaseFlow
from hugegraph_llm.state.ai_state import WkFlowInput, WkFlowState
from hugegraph_llm.operators.common_op.check_schema import CheckSchemaNode
from hugegraph_llm.operators.document_op.chunk_split import ChunkSplitNode
from hugegraph_llm.operators.hugegraph_op.schema_manager import SchemaManagerNode
from hugegraph_llm.operators.llm_op.info_extract import InfoExtractNode
from hugegraph_llm.operators.llm_op.property_graph_extract import (
PropertyGraphExtractNode,
)
from hugegraph_llm.utils.log import log


class GraphExtractFlow(BaseFlow):
def __init__(self):
pass

def _import_schema(
self,
from_hugegraph=None,
from_extraction=None,
from_user_defined=None,
):
if from_hugegraph:
return SchemaManagerNode()
elif from_user_defined:
return CheckSchemaNode()
elif from_extraction:
raise NotImplementedError("Not implemented yet")
else:
raise ValueError("No input data / invalid schema type")

def prepare(
self, prepared_input: WkFlowInput, schema, texts, example_prompt, extract_type
):
# prepare input data
prepared_input.texts = texts
prepared_input.language = "zh"
prepared_input.split_type = "document"
prepared_input.example_prompt = example_prompt
prepared_input.schema = schema
schema = schema.strip()
if schema.startswith("{"):
try:
schema = json.loads(schema)
prepared_input.schema = schema
except json.JSONDecodeError as exc:
log.error("Invalid JSON format in schema. Please check it again.")
raise ValueError("Invalid JSON format in schema.") from exc
else:
log.info("Get schema '%s' from graphdb.", schema)
prepared_input.graph_name = schema
return

def build_flow(self, schema, texts, example_prompt, extract_type):
pipeline = GPipeline()
prepared_input = WkFlowInput()
# prepare input data
self.prepare(prepared_input, schema, texts, example_prompt, extract_type)

pipeline.createGParam(prepared_input, "wkflow_input")
pipeline.createGParam(WkFlowState(), "wkflow_state")
schema = schema.strip()
schema_node = None
if schema.startswith("{"):
try:
schema = json.loads(schema)
schema_node = self._import_schema(from_user_defined=schema)
except json.JSONDecodeError as exc:
log.error("Invalid JSON format in schema. Please check it again.")
raise ValueError("Invalid JSON format in schema.") from exc
else:
log.info("Get schema '%s' from graphdb.", schema)
schema_node = self._import_schema(from_hugegraph=schema)

chunk_split_node = ChunkSplitNode()
graph_extract_node = None
if extract_type == "triples":
graph_extract_node = InfoExtractNode()
elif extract_type == "property_graph":
graph_extract_node = PropertyGraphExtractNode()
else:
raise ValueError(f"Unsupported extract_type: {extract_type}")
pipeline.registerGElement(schema_node, set(), "schema_node")
pipeline.registerGElement(chunk_split_node, set(), "chunk_split")
pipeline.registerGElement(
graph_extract_node, {schema_node, chunk_split_node}, "graph_extract"
)

return pipeline

def post_deal(self, pipeline=None):
res = pipeline.getGParamWithNoEmpty("wkflow_state").to_json()
vertices = res.get("vertices", [])
edges = res.get("edges", [])
if not vertices and not edges:
log.info("Please check the schema.(The schema may not match the Doc)")
return json.dumps(
{
"vertices": vertices,
"edges": edges,
"warning": "The schema may not match the Doc",
},
ensure_ascii=False,
indent=2,
)
return json.dumps(
{"vertices": vertices, "edges": edges},
ensure_ascii=False,
indent=2,
)
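The string-vs-JSON schema dispatch is duplicated between `prepare` and `build_flow` above; it could be factored into one helper. A hypothetical sketch (`classify_schema` is not in the PR):

```python
import json


def classify_schema(schema: str):
    """Classify a schema string: JSON text is a user-defined schema,
    anything else is treated as a graph name to fetch from HugeGraph."""
    schema = schema.strip()
    if schema.startswith("{"):
        try:
            return "user_defined", json.loads(schema)
        except json.JSONDecodeError as exc:
            raise ValueError("Invalid JSON format in schema.") from exc
    return "graph_name", schema
```

Both call sites would then parse the schema exactly once and with identical error behavior.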
90 changes: 90 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/flows/scheduler.py
@@ -0,0 +1,90 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading
from typing import Dict, Any
from PyCGraph import GPipelineManager
from hugegraph_llm.flows.build_vector_index import BuildVectorIndexFlow
from hugegraph_llm.flows.common import BaseFlow
from hugegraph_llm.flows.graph_extract import GraphExtractFlow
from hugegraph_llm.utils.log import log


class Scheduler:
pipeline_pool: Dict[str, Any] = None
max_pipeline: int
⚠️ Important: Unused parameter

The max_pipeline parameter is stored in __init__ but never used anywhere in the class. The GPipelineManager is created without any size limits, so this parameter has no effect.

Recommendation:

  1. Either implement pipeline pool size limits using this parameter
  2. Or remove it if not needed yet
  3. Document the intended behavior for future implementation
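If option 1 is chosen, the cap would have to be enforced on the caller's side, since GPipelineManager exposes no built-in size limit. A hypothetical sketch with illustrative names:

```python
import threading


class BoundedPool:
    """Count pipelines created per flow type and refuse past a cap."""

    def __init__(self, max_pipeline: int = 10):
        self._lock = threading.Lock()
        self._created = 0
        self.max_pipeline = max_pipeline

    def try_acquire_slot(self) -> bool:
        """Reserve a slot for a new pipeline; False once the cap is hit."""
        with self._lock:
            if self._created >= self.max_pipeline:
                return False
            self._created += 1
            return True
```

`schedule_flow` could call `try_acquire_slot()` before `flow.build_flow(...)` and either block or raise when it returns False.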


def __init__(self, max_pipeline: int = 10):
self.pipeline_pool = {}
# pipeline_pool maps each flow type to its GPipelineManager, which is used for pipeline management
self.pipeline_pool["build_vector_index"] = {
"manager": GPipelineManager(),
"flow": BuildVectorIndexFlow(),
}
self.pipeline_pool["graph_extract"] = {
"manager": GPipelineManager(),
"flow": GraphExtractFlow(),
}
self.max_pipeline = max_pipeline

# TODO: Implement Agentic Workflow
def agentic_flow(self):
pass

def schedule_flow(self, flow: str, *args, **kwargs):
‼️ Critical: Missing thread safety in pipeline access

The schedule_flow method accesses and modifies the pipeline pool without proper synchronization. While SchedulerSingleton uses a lock for instance creation, the schedule_flow method itself doesn't protect concurrent access to the shared pipeline_pool dictionary.

Race condition scenarios:

  1. Multiple threads calling schedule_flow for the same flow type
  2. Concurrent fetch() and release() operations on the same manager
  3. Pipeline state corruption during concurrent runs

Recommendation:
Add proper locking around pipeline operations:

def __init__(self, max_pipeline: int = 10):
    self._pool_lock = threading.Lock()  # one lock shared by all calls
    ...

def schedule_flow(self, flow: str, *args, **kwargs):
    if flow not in self.pipeline_pool:
        raise ValueError(f"Unsupported workflow {flow}")

    with self._pool_lock:  # an instance-level lock; `with threading.Lock():` would create a fresh lock per call and protect nothing
        manager = self.pipeline_pool[flow]["manager"]
        # ... rest of the logic

if flow not in self.pipeline_pool:
raise ValueError(f"Unsupported workflow {flow}")
manager = self.pipeline_pool[flow]["manager"]
flow: BaseFlow = self.pipeline_pool[flow]["flow"]
pipeline = manager.fetch()
if pipeline is None:
# call coresponding flow_func to create new workflow
Copilot AI Oct 21, 2025

Corrected spelling of 'coresponding' to 'corresponding'.

Suggested change
# call coresponding flow_func to create new workflow
# call corresponding flow_func to create new workflow

pipeline = flow.build_flow(*args, **kwargs)
‼️ Critical: Incomplete error recovery logic

When pipeline.init() or pipeline.run() fails, the code raises an exception but never calls manager.add(pipeline) to return the pipeline to the pool. This creates a resource leak where failed pipelines are never recycled.

Impact: After max_pipeline failures, the system runs out of pipelines and cannot process new requests.

Recommendation:
Use try-finally to ensure pipelines are always returned:

pipeline = flow.build_flow(*args, **kwargs)
try:
    status = pipeline.init()
    if status.isErr():
        raise RuntimeError(f"Error in flow init: {status.getInfo()}")
    status = pipeline.run()
    if status.isErr():
        raise RuntimeError(f"Error in flow execution: {status.getInfo()}")
    res = flow.post_deal(pipeline)
    return res
finally:
    manager.add(pipeline)

status = pipeline.init()
if status.isErr():
error_msg = f"Error in flow init: {status.getInfo()}"
log.error(error_msg)
raise RuntimeError(error_msg)
status = pipeline.run()
if status.isErr():
error_msg = f"Error in flow execution: {status.getInfo()}"
log.error(error_msg)
raise RuntimeError(error_msg)
res = flow.post_deal(pipeline)
⚠️ Important: Inconsistent error handling

The code uses both raise ValueError and raise RuntimeError inconsistently. Also, the post_deal method can fail (e.g., JSON serialization errors) but is not wrapped in error handling.

Recommendation:

  1. Define a clear exception hierarchy for the flows module
  2. Wrap post_deal in try/except to handle serialization errors
  3. Document which exceptions callers should expect
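One possible shape for such a hierarchy — class names are suggestions, not part of the PR:

```python
class FlowError(Exception):
    """Base class for all errors raised by the flows module."""


class FlowInitError(FlowError):
    """Raised when pipeline.init() reports an error status."""


class FlowExecutionError(FlowError):
    """Raised when pipeline.run() reports an error status."""


class FlowResultError(FlowError):
    """Raised when post_deal fails, e.g. on JSON serialization."""
```

Callers could then catch `FlowError` broadly or a specific subclass, and the docstrings double as the documentation asked for in item 3.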

manager.add(pipeline)
return res
else:
# fetch pipeline & prepare input for flow
prepared_input = pipeline.getGParamWithNoEmpty("wkflow_input")
flow.prepare(prepared_input, *args, **kwargs)
status = pipeline.run()
if status.isErr():
raise RuntimeError(f"Error in flow execution {status.getInfo()}")
res = flow.post_deal(pipeline)
manager.release(pipeline)
return res


class SchedulerSingleton:
_instance = None
_instance_lock = threading.Lock()

@classmethod
def get_instance(cls):
if cls._instance is None:
with cls._instance_lock:
if cls._instance is None:
cls._instance = Scheduler()
return cls._instance
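For reference, the double-checked-locking pattern used by SchedulerSingleton, reduced to a self-contained sketch: the unlocked fast path avoids contention on every call, and the re-check under the lock prevents two racing threads from each constructing an instance.

```python
import threading


class Singleton:
    _instance = None
    _lock = threading.Lock()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:          # fast path, no lock taken
            with cls._lock:                # slow path, serialized
                if cls._instance is None:  # re-check under the lock
                    cls._instance = cls()
        return cls._instance
```

Every caller receives the same object, so the shared pipeline_pool lives in exactly one Scheduler.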