From c0e813ef3a04786c20e8b2095df3d6f925f7b390 Mon Sep 17 00:00:00 2001 From: ken <2979602290@qq.com> Date: Thu, 22 Jan 2026 15:41:32 +0800 Subject: [PATCH 1/3] docs: update loader doc --- .../quickstart/toolchain/hugegraph-loader.md | 106 ++++++++++------- .../quickstart/toolchain/hugegraph-loader.md | 107 +++++++++++------- 2 files changed, 130 insertions(+), 83 deletions(-) diff --git a/content/cn/docs/quickstart/toolchain/hugegraph-loader.md b/content/cn/docs/quickstart/toolchain/hugegraph-loader.md index 9b088e4cb..c7ddfc982 100644 --- a/content/cn/docs/quickstart/toolchain/hugegraph-loader.md +++ b/content/cn/docs/quickstart/toolchain/hugegraph-loader.md @@ -807,47 +807,71 @@ schema: 必填 ##### 3.4.1 参数说明 -| 参数 | 默认值 | 是否必传 | 描述信息 | -|---------------------------|-----------|------|-------------------------------------------------------------------| -| `-f` 或 `--file` | | Y | 配置脚本的路径 | -| `-g` 或 `--graph` | | Y | 图名称 | -| `-gs` 或 `--graphspace` | DEFAULT | | 图空间 | -| `-s` 或 `--schema` | | Y | schema 文件路径 | -| `-h` 或 `--host` 或 `-i` | localhost | | HugeGraphServer 的地址 | -| `-p` 或 `--port` | 8080 | | HugeGraphServer 的端口号 | -| `--username` | null | | 当 HugeGraphServer 开启了权限认证时,当前图的 username | -| `--password` | null | | 当 HugeGraphServer 开启了权限认证时,当前图的 password | -| `--create-graph` | false | | 是否在图不存在时自动创建 | -| `--token` | null | | 当 HugeGraphServer 开启了权限认证时,当前图的 token | -| `--protocol` | http | | 向服务端发请求的协议,可选 http 或 https | -| `--pd-peers` | | | PD 服务节点地址 | -| `--pd-token` | | | 访问 PD 服务的 token | -| `--meta-endpoints` | | | 元信息存储服务地址 | -| `--direct` | false | | 是否直连 HugeGraph-Store | -| `--route-type` | NODE_PORT | | 路由选择方式(可选值:NODE_PORT / DDS / BOTH) | -| `--cluster` | hg | | 集群名 | -| `--trust-store-file` | | | 请求协议为 https 时,客户端的证书文件路径 | -| `--trust-store-password` | | | 请求协议为 https 时,客户端证书密码 | -| `--clear-all-data` | false | | 导入数据前是否清除服务端的原有数据 | -| `--clear-timeout` | 240 | | 导入数据前清除服务端的原有数据的超时时间 | -| `--incremental-mode` | false | | 是否使用断点续导模式,仅输入源为 FILE 和 HDFS 
支持该模式,启用该模式能从上一次导入停止的地方开始导入 | -| `--failure-mode` | false | | 失败模式为 true 时,会导入之前失败了的数据,一般来说失败数据文件需要在人工更正编辑好后,再次进行导入 | -| `--batch-insert-threads` | CPUs | | 批量插入线程池大小 (CPUs 是当前 OS 可用**逻辑核**个数) | -| `--single-insert-threads` | 8 | | 单条插入线程池的大小 | -| `--max-conn` | 4 * CPUs | | HugeClient 与 HugeGraphServer 的最大 HTTP 连接数,**调整线程**的时候建议同时调整此项 | -| `--max-conn-per-route` | 2 * CPUs | | HugeClient 与 HugeGraphServer 每个路由的最大 HTTP 连接数,**调整线程**的时候建议同时调整此项 | -| `--batch-size` | 500 | | 导入数据时每个批次包含的数据条数 | -| `--max-parse-errors` | 1 | | 最多允许多少行数据解析错误,达到该值则程序退出 | -| `--max-insert-errors` | 500 | | 最多允许多少行数据插入错误,达到该值则程序退出 | -| `--timeout` | 60 | | 插入结果返回的超时时间(秒) | -| `--shutdown-timeout` | 10 | | 多线程停止的等待时间(秒) | -| `--retry-times` | 0 | | 发生特定异常时的重试次数 | -| `--retry-interval` | 10 | | 重试之前的间隔时间(秒) | -| `--check-vertex` | false | | 插入边时是否检查边所连接的顶点是否存在 | -| `--print-progress` | true | | 是否在控制台实时打印导入条数 | -| `--dry-run` | false | | 打开该模式,只解析不导入,通常用于测试 | -| `--help` | false | | 打印帮助信息 | - +| 参数 | 默认值 | 是否必传 | 描述信息 | +|-----------------------------------------|-------------|------|-------------------------------------------------------------------| +| `-f` 或 `--file` | | Y | 配置脚本的路径 | +| `-g` 或 `--graph` | | Y | 图名称 | +| `--graphspace` | DEFAULT | | 图空间 | +| `-s` 或 `--schema` | | Y | schema 文件路径 | +| `-h` 或 `--host` 或 `-i` | localhost | | HugeGraphServer 的地址 | +| `-p` 或 `--port` | 8080 | | HugeGraphServer 的端口号 | +| `--username` | null | | 当 HugeGraphServer 开启了权限认证时,当前图的 username | +| `--password` | null | | 当 HugeGraphServer 开启了权限认证时,当前图的 password | +| `--create-graph` | false | | 是否在图不存在时自动创建 | +| `--token` | null | | 当 HugeGraphServer 开启了权限认证时,当前图的 token | +| `--protocol` | http | | 向服务端发请求的协议,可选 http 或 https | +| `--pd-peers` | | | PD 服务节点地址 | +| `--pd-token` | | | 访问 PD 服务的 token | +| `--meta-endpoints` | | | 元信息存储服务地址 | +| `--direct` | false | | 是否直连 HugeGraph-Store | +| `--route-type` | NODE_PORT | | 路由选择方式(可选值:NODE_PORT / DDS / BOTH) | +| `--cluster` | hg | | 集群名 | +| 
`--trust-store-file` | | | 请求协议为 https 时,客户端的证书文件路径 | +| `--trust-store-password` | | | 请求协议为 https 时,客户端证书密码 | +| `--clear-all-data` | false | | 导入数据前是否清除服务端的原有数据 | +| `--clear-timeout` | 240 | | 导入数据前清除服务端的原有数据的超时时间 | +| `--incremental-mode` | false | | 是否使用断点续导模式,仅输入源为 FILE 和 HDFS 支持该模式,启用该模式能从上一次导入停止的地方开始导入 | +| `--failure-mode` | false | | 失败模式为 true 时,会导入之前失败了的数据,一般来说失败数据文件需要在人工更正编辑好后,再次进行导入 | +| `--batch-insert-threads` | CPUs | | 批量插入线程池大小 (CPUs 是当前 OS 可用**逻辑核**个数) | +| `--single-insert-threads` | 8 | | 单条插入线程池的大小 | +| `--max-conn` | 4 * CPUs | | HugeClient 与 HugeGraphServer 的最大 HTTP 连接数,**调整线程**的时候建议同时调整此项 | +| `--max-conn-per-route` | 2 * CPUs | | HugeClient 与 HugeGraphServer 每个路由的最大 HTTP 连接数,**调整线程**的时候建议同时调整此项 | +| `--batch-size` | 500 | | 导入数据时每个批次包含的数据条数 | +| `--max-parse-errors` | 1 | | 最多允许多少行数据解析错误,达到该值则程序退出 | +| `--max-insert-errors` | 500 | | 最多允许多少行数据插入错误,达到该值则程序退出 | +| `--timeout` | 60 | | 插入结果返回的超时时间(秒) | +| `--shutdown-timeout` | 10 | | 多线程停止的等待时间(秒) | +| `--retry-times` | 0 | | 发生特定异常时的重试次数 | +| `--retry-interval` | 10 | | 重试之前的间隔时间(秒) | +| `--check-vertex` | false | | 插入边时是否检查边所连接的顶点是否存在 | +| `--print-progress` | true | | 是否在控制台实时打印导入条数 | +| `--dry-run` | false | | 打开该模式,只解析不导入,通常用于测试 | +| `--help` 或 `-help` | false | | 打印帮助信息 | +| `--parallel-count` 或 `--parallel-count` | max(2,CPUS) | | 并行读取数据文件最大线程数 | +| `--start-file` | 0 | | 用于部分(分片)导入的起始文件索引 | +| `--end-file` | -1 | | 用于部分导入的截止文件索引 | +| `--scatter-sources` | false | | 分散(并行)读取多个数据源以优化 I/O 性能 | +| `--cdc-flush-interval` | 30000 | | Flink CDC 的数据刷新间隔 | +| `--cdc-sink-parallelism` | 1 | | Flink CDC 写入端(Sink)的并行度 | +| `--max-read-errors` | 1 | | 程序退出前允许的最大读取错误行数 | +| `--max-read-lines` | -1L | | 最大读取行数限制;一旦达到此行数,导入任务将停止 | +| `--test-mode` | false | | 是否开启测试模式 | +| `--use-prefilter` | false | | 是否预先过滤顶点 | +| `--short-id` | | | 将自定义 ID 映射为更短的 ID | +| `--vertex-edge-limit` | -1L | | 单个顶点的最大边数限制 | +| `--sink-type` | true | | 数据接收端(Sink)存储类型开关 | +| `--vertex-partitions` | 64 | | HBase 
顶点表的预分区数量 | +| `--edge-partitions` | 64 | | HBase 边表的预分区数量 | +| `--vertex-table-name` | | | HBase 顶点表名称 | +| `--edge-table-name` | | | HBase 边表名称 | +| `--hbase-zk-quorum` | | | HBase Zookeeper 集群地址 | +| `--hbase-zk-port` | | | HBase Zookeeper 端口号 | +| `--hbase-zk-parent` | | | HBase Zookeeper 根路径 | +| `--restore` | false | | 将图模式设置为恢复模式 (RESTORING) | +| `--backend` | hstore | | 自动创建图(如果不存在)时的后端存储类型 | +| `--serializer` | binary | | 自动创建图(如果不存在)时的序列化器类型 | +| `--scheduler-type` | distributed | | 自动创建图(如果不存在)时的任务调度器类型 | +| `--batch-failure-fallback` | true | | 批量插入失败时是否回退至单条插入模式 | ##### 3.4.2 断点续导模式 通常情况下,Loader 任务都需要较长时间执行,如果因为某些原因导致导入中断进程退出,而下次希望能从中断的点继续导,这就是使用断点续导的场景。 diff --git a/content/en/docs/quickstart/toolchain/hugegraph-loader.md b/content/en/docs/quickstart/toolchain/hugegraph-loader.md index 6d14d05ae..bfd1925ca 100644 --- a/content/en/docs/quickstart/toolchain/hugegraph-loader.md +++ b/content/en/docs/quickstart/toolchain/hugegraph-loader.md @@ -794,48 +794,71 @@ The import process is controlled by commands submitted by the user, and the user ##### 3.4.1 Parameter description -| Parameter | Default value | Required or not | Description | -|---------------------------|---------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `-f` or `--file` | | Y | Path to configure script | -| `-g` or `--graph` | | Y | Graph name | -| `-gs` or `--graphspace` | DEFAULT | | Graph space name | -| `-s` or `--schema` | | Y | Schema file path | -| `-h` or `--host` or `-i` | localhost | | Address of HugeGraphServer | -| `-p` or `--port` | 8080 | | Port number of HugeGraphServer | -| `--username` | null | | When HugeGraphServer enables permission authentication, the username of the current graph | -| `--password` | null | | When HugeGraphServer enables permission authentication, the password of the current graph | 
-| `--create-graph` | false | | Whether to automatically create the graph if it does not exist | -| `--token` | null | | When HugeGraphServer has enabled authorization authentication, the token of the current graph | -| `--protocol` | http | | Protocol for sending requests to the server, optional http or https | -| `--pd-peers` | | | PD service node addresses | -| `--pd-token` | | | Token for accessing PD service | -| `--meta-endpoints` | | | Meta information storage service addresses | -| `--direct` | false | | Whether to directly connect to HugeGraph-Store | -| `--route-type` | NODE_PORT | | Route selection method (optional values: NODE_PORT / DDS / BOTH) | -| `--cluster` | hg | | Cluster name | -| `--trust-store-file` | | | When the request protocol is https, the client's certificate file path | -| `--trust-store-password` | | | When the request protocol is https, the client certificate password | -| `--clear-all-data` | false | | Whether to clear the original data on the server before importing data | -| `--clear-timeout` | 240 | | Timeout for clearing the original data on the server before importing data | -| `--incremental-mode` | false | | Whether to use the breakpoint resume mode; only input sources FILE and HDFS support this mode. Enabling this mode allows starting the import from where the last import stopped | -| `--failure-mode` | false | | When failure mode is true, previously failed data will be imported. 
Generally, the failed data file needs to be manually corrected and edited before re-importing | -| `--batch-insert-threads` | CPUs | | Batch insert thread pool size (CPUs is the number of **logical cores** available to the current OS) | -| `--single-insert-threads` | 8 | | Size of single insert thread pool | -| `--max-conn` | 4 * CPUs | | The maximum number of HTTP connections between HugeClient and HugeGraphServer; it is recommended to adjust this when **adjusting threads** | -| `--max-conn-per-route` | 2 * CPUs | | The maximum number of HTTP connections for each route between HugeClient and HugeGraphServer; it is recommended to adjust this item when **adjusting threads** | -| `--batch-size` | 500 | | The number of data items in each batch when importing data | -| `--max-parse-errors` | 1 | | The maximum number of data parsing errors allowed (per line); the program exits when this value is reached | -| `--max-insert-errors` | 500 | | The maximum number of data insertion errors allowed (per row); the program exits when this value is reached | -| `--timeout` | 60 | | Timeout (seconds) for insert result return | -| `--shutdown-timeout` | 10 | | Waiting time for multithreading to stop (seconds) | -| `--retry-times` | 0 | | Number of retries when a specific exception occurs | -| `--retry-interval` | 10 | | Interval before retry (seconds) | -| `--check-vertex` | false | | Whether to check if the vertices connected by the edge exist when inserting the edge | -| `--print-progress` | true | | Whether to print the number of imported items in real time on the console | -| `--dry-run` | false | | Enable this mode to only parse data without importing; usually used for testing | -| `--help` | false | | Print help information | - -##### 3.4.2 Breakpoint Continuation Mode +| Parameter | Default value | Required or not | Description | 
+|-----------------------------------------|---------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `-f` or `--file` | | Y | Path to configure script | +| `-g` or `--graph` | | Y | Graph name | +| `--graphspace` | DEFAULT | | Graph space name | +| `-s` or `--schema` | | Y | Schema file path | +| `-h` or `--host` or `-i` | localhost | | Address of HugeGraphServer | +| `-p` or `--port` | 8080 | | Port number of HugeGraphServer | +| `--username` | null | | When HugeGraphServer enables permission authentication, the username of the current graph | +| `--password` | null | | When HugeGraphServer enables permission authentication, the password of the current graph | +| `--create-graph` | false | | Whether to automatically create the graph if it does not exist | +| `--token` | null | | When HugeGraphServer has enabled authorization authentication, the token of the current graph | +| `--protocol` | http | | Protocol for sending requests to the server, optional http or https | +| `--pd-peers` | | | PD service node addresses | +| `--pd-token` | | | Token for accessing PD service | +| `--meta-endpoints` | | | Meta information storage service addresses | +| `--direct` | false | | Whether to directly connect to HugeGraph-Store | +| `--route-type` | NODE_PORT | | Route selection method (optional values: NODE_PORT / DDS / BOTH) | +| `--cluster` | hg | | Cluster name | +| `--trust-store-file` | | | When the request protocol is https, the client's certificate file path | +| `--trust-store-password` | | | When the request protocol is https, the client certificate password | +| `--clear-all-data` | false | | Whether to clear the original data on the server before importing data | +| `--clear-timeout` | 240 | | Timeout for clearing the original data on the server before importing data | +| `--incremental-mode` | false 
| | Whether to use the breakpoint resume mode; only input sources FILE and HDFS support this mode. Enabling this mode allows starting the import from where the last import stopped | +| `--failure-mode` | false | | When failure mode is true, previously failed data will be imported. Generally, the failed data file needs to be manually corrected and edited before re-importing | +| `--batch-insert-threads` | CPUs | | Batch insert thread pool size (CPUs is the number of **logical cores** available to the current OS) | +| `--single-insert-threads` | 8 | | Size of single insert thread pool | +| `--max-conn` | 4 * CPUs | | The maximum number of HTTP connections between HugeClient and HugeGraphServer; it is recommended to adjust this when **adjusting threads** | +| `--max-conn-per-route` | 2 * CPUs | | The maximum number of HTTP connections for each route between HugeClient and HugeGraphServer; it is recommended to adjust this item when **adjusting threads** | +| `--batch-size` | 500 | | The number of data items in each batch when importing data | +| `--max-parse-errors` | 1 | | The maximum number of data parsing errors allowed (per line); the program exits when this value is reached | +| `--max-insert-errors` | 500 | | The maximum number of data insertion errors allowed (per row); the program exits when this value is reached | +| `--timeout` | 60 | | Timeout (seconds) for insert result return | +| `--shutdown-timeout` | 10 | | Waiting time for multithreading to stop (seconds) | +| `--retry-times` | 0 | | Number of retries when a specific exception occurs | +| `--retry-interval` | 10 | | Interval before retry (seconds) | +| `--check-vertex` | false | | Whether to check if the vertices connected by the edge exist when inserting the edge | +| `--print-progress` | true | | Whether to print the number of imported items in real time on the console | +| `--dry-run` | false | | Enable this mode to only parse data without importing; usually used for testing | +| `--help` or `-help` 
| false | | Print help information | +| `--parallel-count` 或 `--parallel-count` | max(2,CPUS) | | Parallel read pipelines for data files | +| `--start-file` | 0 | | Start file index for partial loading | +| `--end-file` | -1 | | End file index for partial loading | +| `--scatter-sources` | false | | Scatter multiple sources for I/O optimization | +| `--cdc-flush-interval` | 30000 | | The flush interval for Flink CDC | +| `--cdc-sink-parallelism` | 1 | | The sink parallelism for Flink CDC | +| `--max-read-errors` | 1 | | The maximum number of read error lines before exiting | +| `--max-read-lines` | -1L | | The maximum number of read lines, task stops when reached | +| `--test-mode` | false | | Whether the loader works in test mode | +| `--use-prefilter` | false | | Whether to filter vertex in advance | +| `--short-id` | | | Mapping customized ID to shorter ID | +| `--vertex-edge-limit` | -1L | | The maximum number of vertex's edges | +| `--sink-type` | true | | Sink to different storage type switch | +| `--vertex-partitions` | 64 | | The number of partitions of the HBase vertex table | +| `--edge-partitions` | 64 | | The number of partitions of the HBase edge table | +| `--vertex-table-name` | | | HBase vertex table name | +| `--edge-table-name` | | | HBase edge table name | +| `--hbase-zk-quorum` | | | HBase ZooKeeper quorum | +| `--hbase-zk-port` | | | HBase ZooKeeper port | +| `--hbase-zk-parent` | | | HBase ZooKeeper parent | +| `--restore` | false | | Set graph mode to RESTORING | +| `--backend` | hstore | | The backend store type when creating graph if not exists | +| `--serializer` | binary | | The serializer type when creating graph if not exists | +| `--scheduler-type` | distributed | | The task scheduler type when creating graph if not exists | +| `--batch-failure-fallback` | true | | Whether to fallback to single insert when batch insert fails |##### 3.4.2 Breakpoint Continuation Mode Usually, the Loader task takes a long time to execute. 
If the import interrupt process exits for some reason, and next time you want to continue the import from the interrupted point, this is the scenario of using breakpoint continuation. From 5aa1263f6dfc51386554106177e8819a6ccc5689 Mon Sep 17 00:00:00 2001 From: ken <2979602290@qq.com> Date: Thu, 22 Jan 2026 15:42:45 +0800 Subject: [PATCH 2/3] docs: update loader doc --- content/en/docs/quickstart/toolchain/hugegraph-loader.md | 1 + 1 file changed, 1 insertion(+) diff --git a/content/en/docs/quickstart/toolchain/hugegraph-loader.md b/content/en/docs/quickstart/toolchain/hugegraph-loader.md index bfd1925ca..60fa02eef 100644 --- a/content/en/docs/quickstart/toolchain/hugegraph-loader.md +++ b/content/en/docs/quickstart/toolchain/hugegraph-loader.md @@ -859,6 +859,7 @@ The import process is controlled by commands submitted by the user, and the user | `--serializer` | binary | | The serializer type when creating graph if not exists | | `--scheduler-type` | distributed | | The task scheduler type when creating graph if not exists | | `--batch-failure-fallback` | true | | Whether to fallback to single insert when batch insert fails |##### 3.4.2 Breakpoint Continuation Mode +##### 3.4.2 Breakpoint Continuation Mode Usually, the Loader task takes a long time to execute. If the import interrupt process exits for some reason, and next time you want to continue the import from the interrupted point, this is the scenario of using breakpoint continuation. 
From 4c2b76b9e1de03dca10c8b065f9dbd26044c60ff Mon Sep 17 00:00:00 2001 From: ken <2979602290@qq.com> Date: Thu, 22 Jan 2026 18:03:05 +0800 Subject: [PATCH 3/3] docs: update loader doc --- .../quickstart/toolchain/hugegraph-loader.md | 50 +++---- .../quickstart/toolchain/hugegraph-loader.md | 130 +++++++++--------- 2 files changed, 90 insertions(+), 90 deletions(-) diff --git a/content/cn/docs/quickstart/toolchain/hugegraph-loader.md b/content/cn/docs/quickstart/toolchain/hugegraph-loader.md index c7ddfc982..d10419ed5 100644 --- a/content/cn/docs/quickstart/toolchain/hugegraph-loader.md +++ b/content/cn/docs/quickstart/toolchain/hugegraph-loader.md @@ -847,31 +847,31 @@ schema: 必填 | `--print-progress` | true | | 是否在控制台实时打印导入条数 | | `--dry-run` | false | | 打开该模式,只解析不导入,通常用于测试 | | `--help` 或 `-help` | false | | 打印帮助信息 | -| `--parallel-count` 或 `--parallel-count` | max(2,CPUS) | | 并行读取数据文件最大线程数 | -| `--start-file` | 0 | | 用于部分(分片)导入的起始文件索引 | -| `--end-file` | -1 | | 用于部分导入的截止文件索引 | -| `--scatter-sources` | false | | 分散(并行)读取多个数据源以优化 I/O 性能 | -| `--cdc-flush-interval` | 30000 | | Flink CDC 的数据刷新间隔 | -| `--cdc-sink-parallelism` | 1 | | Flink CDC 写入端(Sink)的并行度 | -| `--max-read-errors` | 1 | | 程序退出前允许的最大读取错误行数 | -| `--max-read-lines` | -1L | | 最大读取行数限制;一旦达到此行数,导入任务将停止 | -| `--test-mode` | false | | 是否开启测试模式 | -| `--use-prefilter` | false | | 是否预先过滤顶点 | -| `--short-id` | | | 将自定义 ID 映射为更短的 ID | -| `--vertex-edge-limit` | -1L | | 单个顶点的最大边数限制 | -| `--sink-type` | true | | 数据接收端(Sink)存储类型开关 | -| `--vertex-partitions` | 64 | | HBase 顶点表的预分区数量 | -| `--edge-partitions` | 64 | | HBase 边表的预分区数量 | -| `--vertex-table-name` | | | HBase 顶点表名称 | -| `--edge-table-name` | | | HBase 边表名称 | -| `--hbase-zk-quorum` | | | HBase Zookeeper 集群地址 | -| `--hbase-zk-port` | | | HBase Zookeeper 端口号 | -| `--hbase-zk-parent` | | | HBase Zookeeper 根路径 | -| `--restore` | false | | 将图模式设置为恢复模式 (RESTORING) | -| `--backend` | hstore | | 自动创建图(如果不存在)时的后端存储类型 | -| `--serializer` | binary | | 
自动创建图(如果不存在)时的序列化器类型 | -| `--scheduler-type` | distributed | | 自动创建图(如果不存在)时的任务调度器类型 | -| `--batch-failure-fallback` | true | | 批量插入失败时是否回退至单条插入模式 | +| `--parser-threads` 或 `--parallel-count` | max(2,CPUS) | | 并行读取数据文件最大线程数 | +| `--start-file` | 0 | | 用于部分(分片)导入的起始文件索引 | +| `--end-file` | -1 | | 用于部分导入的截止文件索引 | +| `--scatter-sources` | false | | 分散(并行)读取多个数据源以优化 I/O 性能 | +| `--cdc-flush-interval` | 30000 | | Flink CDC 的数据刷新间隔 | +| `--cdc-sink-parallelism` | 1 | | Flink CDC 写入端(Sink)的并行度 | +| `--max-read-errors` | 1 | | 程序退出前允许的最大读取错误行数 | +| `--max-read-lines` | -1L | | 最大读取行数限制;一旦达到此行数,导入任务将停止 | +| `--test-mode` | false | | 是否开启测试模式 | +| `--use-prefilter` | false | | 是否预先过滤顶点 | +| `--short-id` | [] | | 将自定义 ID 映射为更短的 ID | +| `--vertex-edge-limit` | -1L | | 单个顶点的最大边数限制 | +| `--sink-type` | true | | 是否输出至不同的存储 | +| `--vertex-partitions` | 64 | | HBase 顶点表的预分区数量 | +| `--edge-partitions` | 64 | | HBase 边表的预分区数量 | +| `--vertex-table-name` | | | HBase 顶点表名称 | +| `--edge-table-name` | | | HBase 边表名称 | +| `--hbase-zk-quorum` | | | HBase Zookeeper 集群地址 | +| `--hbase-zk-port` | | | HBase Zookeeper 端口号 | +| `--hbase-zk-parent` | | | HBase Zookeeper 根路径 | +| `--restore` | false | | 将图模式设置为恢复模式 (RESTORING) | +| `--backend` | hstore | | 自动创建图(如果不存在)时的后端存储类型 | +| `--serializer` | binary | | 自动创建图(如果不存在)时的序列化器类型 | +| `--scheduler-type` | distributed | | 自动创建图(如果不存在)时的任务调度器类型 | +| `--batch-failure-fallback` | true | | 批量插入失败时是否回退至单条插入模式 | ##### 3.4.2 断点续导模式 通常情况下,Loader 任务都需要较长时间执行,如果因为某些原因导致导入中断进程退出,而下次希望能从中断的点继续导,这就是使用断点续导的场景。 diff --git a/content/en/docs/quickstart/toolchain/hugegraph-loader.md b/content/en/docs/quickstart/toolchain/hugegraph-loader.md index 60fa02eef..627ba7c13 100644 --- a/content/en/docs/quickstart/toolchain/hugegraph-loader.md +++ b/content/en/docs/quickstart/toolchain/hugegraph-loader.md @@ -794,71 +794,71 @@ The import process is controlled by commands submitted by the user, and the user ##### 3.4.1 Parameter description -| Parameter | Default value | 
Required or not | Description | -|-----------------------------------------|---------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `-f` or `--file` | | Y | Path to configure script | -| `-g` or `--graph` | | Y | Graph name | -| `--graphspace` | DEFAULT | | Graph space name | -| `-s` or `--schema` | | Y | Schema file path | -| `-h` or `--host` or `-i` | localhost | | Address of HugeGraphServer | -| `-p` or `--port` | 8080 | | Port number of HugeGraphServer | -| `--username` | null | | When HugeGraphServer enables permission authentication, the username of the current graph | -| `--password` | null | | When HugeGraphServer enables permission authentication, the password of the current graph | -| `--create-graph` | false | | Whether to automatically create the graph if it does not exist | -| `--token` | null | | When HugeGraphServer has enabled authorization authentication, the token of the current graph | -| `--protocol` | http | | Protocol for sending requests to the server, optional http or https | -| `--pd-peers` | | | PD service node addresses | -| `--pd-token` | | | Token for accessing PD service | -| `--meta-endpoints` | | | Meta information storage service addresses | -| `--direct` | false | | Whether to directly connect to HugeGraph-Store | -| `--route-type` | NODE_PORT | | Route selection method (optional values: NODE_PORT / DDS / BOTH) | -| `--cluster` | hg | | Cluster name | -| `--trust-store-file` | | | When the request protocol is https, the client's certificate file path | -| `--trust-store-password` | | | When the request protocol is https, the client certificate password | -| `--clear-all-data` | false | | Whether to clear the original data on the server before importing data | -| `--clear-timeout` | 240 | | Timeout for clearing the original data on the server before importing data | 
-| `--incremental-mode` | false | | Whether to use the breakpoint resume mode; only input sources FILE and HDFS support this mode. Enabling this mode allows starting the import from where the last import stopped | -| `--failure-mode` | false | | When failure mode is true, previously failed data will be imported. Generally, the failed data file needs to be manually corrected and edited before re-importing | -| `--batch-insert-threads` | CPUs | | Batch insert thread pool size (CPUs is the number of **logical cores** available to the current OS) | -| `--single-insert-threads` | 8 | | Size of single insert thread pool | -| `--max-conn` | 4 * CPUs | | The maximum number of HTTP connections between HugeClient and HugeGraphServer; it is recommended to adjust this when **adjusting threads** | -| `--max-conn-per-route` | 2 * CPUs | | The maximum number of HTTP connections for each route between HugeClient and HugeGraphServer; it is recommended to adjust this item when **adjusting threads** | -| `--batch-size` | 500 | | The number of data items in each batch when importing data | -| `--max-parse-errors` | 1 | | The maximum number of data parsing errors allowed (per line); the program exits when this value is reached | -| `--max-insert-errors` | 500 | | The maximum number of data insertion errors allowed (per row); the program exits when this value is reached | -| `--timeout` | 60 | | Timeout (seconds) for insert result return | -| `--shutdown-timeout` | 10 | | Waiting time for multithreading to stop (seconds) | -| `--retry-times` | 0 | | Number of retries when a specific exception occurs | -| `--retry-interval` | 10 | | Interval before retry (seconds) | -| `--check-vertex` | false | | Whether to check if the vertices connected by the edge exist when inserting the edge | -| `--print-progress` | true | | Whether to print the number of imported items in real time on the console | -| `--dry-run` | false | | Enable this mode to only parse data without importing; usually used for 
testing | -| `--help` or `-help` | false | | Print help information | -| `--parallel-count` 或 `--parallel-count` | max(2,CPUS) | | Parallel read pipelines for data files | -| `--start-file` | 0 | | Start file index for partial loading | -| `--end-file` | -1 | | End file index for partial loading | -| `--scatter-sources` | false | | Scatter multiple sources for I/O optimization | -| `--cdc-flush-interval` | 30000 | | The flush interval for Flink CDC | -| `--cdc-sink-parallelism` | 1 | | The sink parallelism for Flink CDC | -| `--max-read-errors` | 1 | | The maximum number of read error lines before exiting | -| `--max-read-lines` | -1L | | The maximum number of read lines, task stops when reached | -| `--test-mode` | false | | Whether the loader works in test mode | -| `--use-prefilter` | false | | Whether to filter vertex in advance | -| `--short-id` | | | Mapping customized ID to shorter ID | -| `--vertex-edge-limit` | -1L | | The maximum number of vertex's edges | -| `--sink-type` | true | | Sink to different storage type switch | -| `--vertex-partitions` | 64 | | The number of partitions of the HBase vertex table | -| `--edge-partitions` | 64 | | The number of partitions of the HBase edge table | -| `--vertex-table-name` | | | HBase vertex table name | -| `--edge-table-name` | | | HBase edge table name | -| `--hbase-zk-quorum` | | | HBase ZooKeeper quorum | -| `--hbase-zk-port` | | | HBase ZooKeeper port | -| `--hbase-zk-parent` | | | HBase ZooKeeper parent | -| `--restore` | false | | Set graph mode to RESTORING | -| `--backend` | hstore | | The backend store type when creating graph if not exists | -| `--serializer` | binary | | The serializer type when creating graph if not exists | -| `--scheduler-type` | distributed | | The task scheduler type when creating graph if not exists | -| `--batch-failure-fallback` | true | | Whether to fallback to single insert when batch insert fails |##### 3.4.2 Breakpoint Continuation Mode +| Parameter | Default value | 
Required or not | Description | +|------------------------------------------|---------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `-f` or `--file` | | Y | Path to configure script | +| `-g` or `--graph` | | Y | Graph name | +| `--graphspace` | DEFAULT | | Graph space name | +| `-s` or `--schema` | | Y | Schema file path | +| `-h` or `--host` or `-i` | localhost | | Address of HugeGraphServer | +| `-p` or `--port` | 8080 | | Port number of HugeGraphServer | +| `--username` | null | | When HugeGraphServer enables permission authentication, the username of the current graph | +| `--password` | null | | When HugeGraphServer enables permission authentication, the password of the current graph | +| `--create-graph` | false | | Whether to automatically create the graph if it does not exist | +| `--token` | null | | When HugeGraphServer has enabled authorization authentication, the token of the current graph | +| `--protocol` | http | | Protocol for sending requests to the server, optional http or https | +| `--pd-peers` | | | PD service node addresses | +| `--pd-token` | | | Token for accessing PD service | +| `--meta-endpoints` | | | Meta information storage service addresses | +| `--direct` | false | | Whether to directly connect to HugeGraph-Store | +| `--route-type` | NODE_PORT | | Route selection method (optional values: NODE_PORT / DDS / BOTH) | +| `--cluster` | hg | | Cluster name | +| `--trust-store-file` | | | When the request protocol is https, the client's certificate file path | +| `--trust-store-password` | | | When the request protocol is https, the client certificate password | +| `--clear-all-data` | false | | Whether to clear the original data on the server before importing data | +| `--clear-timeout` | 240 | | Timeout for clearing the original data on the server before importing data 
| +| `--incremental-mode` | false | | Whether to use the breakpoint resume mode; only input sources FILE and HDFS support this mode. Enabling this mode allows starting the import from where the last import stopped | +| `--failure-mode` | false | | When failure mode is true, previously failed data will be imported. Generally, the failed data file needs to be manually corrected and edited before re-importing | +| `--batch-insert-threads` | CPUs | | Batch insert thread pool size (CPUs is the number of **logical cores** available to the current OS) | +| `--single-insert-threads` | 8 | | Size of single insert thread pool | +| `--max-conn` | 4 * CPUs | | The maximum number of HTTP connections between HugeClient and HugeGraphServer; it is recommended to adjust this when **adjusting threads** | +| `--max-conn-per-route` | 2 * CPUs | | The maximum number of HTTP connections for each route between HugeClient and HugeGraphServer; it is recommended to adjust this item when **adjusting threads** | +| `--batch-size` | 500 | | The number of data items in each batch when importing data | +| `--max-parse-errors` | 1 | | The maximum number of data parsing errors allowed (per line); the program exits when this value is reached | +| `--max-insert-errors` | 500 | | The maximum number of data insertion errors allowed (per row); the program exits when this value is reached | +| `--timeout` | 60 | | Timeout (seconds) for insert result return | +| `--shutdown-timeout` | 10 | | Waiting time for multithreading to stop (seconds) | +| `--retry-times` | 0 | | Number of retries when a specific exception occurs | +| `--retry-interval` | 10 | | Interval before retry (seconds) | +| `--check-vertex` | false | | Whether to check if the vertices connected by the edge exist when inserting the edge | +| `--print-progress` | true | | Whether to print the number of imported items in real time on the console | +| `--dry-run` | false | | Enable this mode to only parse data without importing; usually used 
for testing                                                                                                                             | +| `--help` or `-help`                      | false         |                 | Print help information                                                                                                                                                                     | +| `--parser-threads` or `--parallel-count` | max(2,CPUS)   |                 | Parallel read pipelines for data files                                                                                                                                                      | +| `--start-file`                           | 0             |                 | Start file index for partial loading                                                                                                                                                        | +| `--end-file`                             | -1            |                 | End file index for partial loading                                                                                                                                                          | +| `--scatter-sources`                      | false         |                 | Scatter multiple sources for I/O optimization                                                                                                                                               | +| `--cdc-flush-interval`                   | 30000         |                 | The flush interval for Flink CDC                                                                                                                                                            | +| `--cdc-sink-parallelism`                 | 1             |                 | The sink parallelism for Flink CDC                                                                                                                                                          | +| `--max-read-errors`                      | 1             |                 | The maximum number of read error lines before exiting                                                                                                                                       | +| `--max-read-lines`                       | -1L           |                 | The maximum number of read lines, task stops when reached                                                                                                                                   | +| `--test-mode`                            | false         |                 | Whether the loader works in test mode                                                                                                                                                       | +| `--use-prefilter`                        | false         |                 | Whether to filter vertex in advance                                                                                                                                                         | +| `--short-id`                             |               |                 | Mapping customized ID to shorter ID                                                                                                                                                         | +| `--vertex-edge-limit`                    | -1L           |                 | The maximum number of vertex's edges                                                                                                                                                        | +| `--sink-type`                            | true          |                 | Sink to different storage type switch                                                                                                                                                       | +| `--vertex-partitions`                    | 64            |                 | The number of partitions of the HBase vertex table                                                                                                                                          | +| `--edge-partitions`                      | 64            |                 | The number of partitions of the HBase edge table                                                                                                                                            | +| `--vertex-table-name`                    |               |                 | HBase vertex table name                                                                                                                                                                     | +| `--edge-table-name`                      |               |                 | HBase edge table name                                                                                                                                                                       | +| `--hbase-zk-quorum`                      |               |                 | HBase ZooKeeper quorum                                                                                                                                                                      | +| `--hbase-zk-port`                        |               |                 | HBase ZooKeeper port                                                                                                                                                                        | +| `--hbase-zk-parent`                      |               |                 | HBase ZooKeeper parent                                                                                                                                                                      | +| `--restore`                              | false         |                 | Set graph mode to RESTORING                                                                                                                                                                 | +| `--backend`                              | hstore        |                 | The backend store type when creating graph if not exists                                                                                                                                    | +| `--serializer`                           | binary        |                 | The serializer type when creating graph if not exists                                                                                                                                       | +| `--scheduler-type`                       | distributed   |                 | The task scheduler type when creating graph if not exists                                                                                                                                   | +| `--batch-failure-fallback`               | true          |                 | Whether to fallback to single insert when batch insert fails                                                                                                                                | ##### 3.4.2 Breakpoint
Continuation Mode Usually, the Loader task takes a long time to execute. If the import interrupt process exits for some reason, and next time you want to continue the import from the interrupted point, this is the scenario of using breakpoint continuation.