Patch notes (free text ahead of the first file header; not part of any hunk).

Purpose: teach the TiDB Jepsen suite about the tikv-worker daemon.
  * resources: system-tidb.conf gains an [instance] section with tidb_service_scope = "dxf_service"; the new tikv-worker.toml carries the [ia] mem-cap / disk-cap limits.
  * core.clj: registers :kill-tikv-worker, :stop-tikv-worker and :pause-tikv-worker as process faults and adds two plot entries for them.
  * db.clj: adds configure / start / ready? / start-wait / stop helpers for tikv-worker, calls them next to the system-db steps (under enable-system? in setup, under (:enable-system-tidb test) when stopping), links the binary into the bin layout and collects its log files.
  * nemesis.clj: process-nemesis, the composed nemesis, the generators, the final recovery ops and expand-options all learn the tikv-worker ops.

Change relative to the patch as first posted: the :start-tikv-worker branch of process-nemesis is gated on (:enable-system-tidb test) and yields :tikv-worker-disabled otherwise. expand-options turns the :kill and :stop shorthands into the tikv-worker faults unconditionally, and the generators pair those with :start-tikv-worker, while db.clj only writes tikv-worker's config and starts it under enable-system?. Without the gate a recovery op would launch a daemon whose config file was never written. The edit replaces one added line with one added line, so hunk line counts are unchanged; the blob hashes on the nemesis.clj index line predate it.

NOTE(review): assumes enable-system? in setup mirrors (:enable-system-tidb test) - confirm against the part of db.clj outside these hunks.
NOTE(review): tikv-worker-log-file is defined and collected, but start-tikv-worker! passes no flag pointing the daemon at that path - confirm where tikv-worker actually writes its log.
NOTE(review): tikv-worker-ready? polls /status on the same port that is passed as --addr - confirm tikv-worker serves HTTP status there.
NOTE(review): this copy of the patch has lost its line breaks and indentation; restore them (the hunk headers give the line counts) before feeding it to git apply.

diff --git a/tidb/resources/system-tidb.conf b/tidb/resources/system-tidb.conf index 41d6fff8d..302c7cfc6 100644 --- a/tidb/resources/system-tidb.conf +++ b/tidb/resources/system-tidb.conf @@ -2,6 +2,9 @@ lease = "5s" split-table = true keyspace-name = "SYSTEM" +[instance] +tidb_service_scope = "dxf_service" + [log] slow-query-file = "system-slow.log" diff --git a/tidb/resources/tikv-worker.toml b/tidb/resources/tikv-worker.toml new file mode 100644 index 000000000..4975c6ac0 --- /dev/null +++ b/tidb/resources/tikv-worker.toml @@ -0,0 +1,7 @@ +# tikv-worker configuration for Jepsen tests +# Note: [dfs] section should be provided via manifest parameters (WORKER_CONFIG) +# for environment-specific S3/MinIO settings + +[ia] +mem-cap = "5GB" +disk-cap = "20GB" diff --git a/tidb/src/tidb/core.clj b/tidb/src/tidb/core.clj index adb6c9932..3132ecde2 100644 --- a/tidb/src/tidb/core.clj +++ b/tidb/src/tidb/core.clj @@ -162,12 +162,15 @@ :kill-pd :kill-kv :kill-db + :kill-tikv-worker :stop-pd :stop-kv :stop-db + :stop-tikv-worker :pause-pd :pause-kv :pause-db + :pause-tikv-worker :schedules :shuffle-leader :shuffle-region @@ -181,7 +184,9 @@ (def process-faults "Faults affecting individual processes" - [:kill-pd :kill-kv :kill-db :stop-pd :stop-kv :stop-db :pause-pd :pause-kv :pause-db]) + [:kill-pd :kill-kv :kill-db :kill-tikv-worker + :stop-pd :stop-kv :stop-db :stop-tikv-worker + :pause-pd :pause-kv :pause-db :pause-tikv-worker]) (def network-faults "Faults affecting the network" @@ -266,6 +271,10 @@ :color "#E9A0CF" :start #{:kill-db :stop-db} :stop #{:start-db}} + {:name "kill tikv-worker" + :color "#E9C0A0" + :start #{:kill-tikv-worker :stop-tikv-worker} + :stop #{:start-tikv-worker}} {:name "pause pd" :color "#C5A0E9" :start #{:pause-pd} @@ -278,6 +287,10 @@ :color "#A6A0E9" :start #{:pause-db} :stop #{:resume-db}} + {:name "pause tikv-worker" + :color "#D9C0A0" + :start #{:pause-tikv-worker} + :stop #{:resume-tikv-worker}} {:name "shuffle-leader" :color "#A6D0E9" 
:start #{:shuffle-leader} diff --git a/tidb/src/tidb/db.clj b/tidb/src/tidb/db.clj index 22322c912..98e499bdb 100644 --- a/tidb/src/tidb/db.clj +++ b/tidb/src/tidb/db.clj @@ -48,6 +48,13 @@ (def system-db-pid-file (str tidb-dir "/system-db.pid")) (def system-db-port 14000) (def system-db-status-port 11080) +(def tikv-worker-bin "tikv-worker") +(def tikv-worker-config-file (str tidb-dir "/tikv-worker.conf")) +(def tikv-worker-log-file (str tidb-dir "/tikv-worker.log")) +(def tikv-worker-stdout (str tidb-dir "/tikv-worker.stdout")) +(def tikv-worker-pid-file (str tidb-dir "/tikv-worker.pid")) +(def tikv-worker-data-dir (str tidb-dir "/data/tikv-worker")) +(def tikv-worker-port 19000) (def pd-services {:api {:bin "pd-api" @@ -133,6 +140,12 @@ (c/su (c/exec :echo (slurp (io/resource "system-tidb.conf")) :> system-db-config-file))) +(defn configure-tikv-worker! + "Writes configuration file for tikv-worker" + [] + (c/su (c/exec :echo (slurp (io/resource "tikv-worker.toml")) + :> tikv-worker-config-file))) + (defn configure! "Write all config files." [] @@ -298,6 +311,21 @@ :-P (str system-db-port) :--status (str system-db-status-port)))) +(defn start-tikv-worker! + "Starts the TiKV-Worker daemon" + [test node] + (c/su + (cu/start-daemon! + {:logfile tikv-worker-stdout + :pidfile tikv-worker-pid-file + :chdir tidb-dir} + (str "./bin/" tikv-worker-bin) + :--addr (str "0.0.0.0:" tikv-worker-port) + :--advertise-addr (str (name node) ":" tikv-worker-port) + :--pd-endpoints (pd-endpoints test) + :--config tikv-worker-config-file + :--data-dir tikv-worker-data-dir))) + (defn page-ready? "Fetches a status page URL on the local node, and returns true iff the page was available." @@ -326,6 +354,11 @@ [] (page-ready? (str "http://127.0.0.1:" system-db-status-port "/status"))) +(defn tikv-worker-ready? + "Is TiKV-Worker ready?" + [] + (page-ready? 
(str "http://127.0.0.1:" tikv-worker-port "/status"))) + (defn restart-loop* "TiDB is fragile on startup; processes love to crash if they can't complete their initial requests to network dependencies. We try to work around this by @@ -405,6 +438,14 @@ (cu/daemon-running? system-db-pid-file) :starting true :crashed))) +(defn start-wait-tikv-worker! + "Starts TiKV-Worker, waiting for its status page to come online." + [test node] + (restart-loop :tikv-worker (start-tikv-worker! test node) + (cond (tikv-worker-ready?) :ready + (cu/daemon-running? tikv-worker-pid-file) :starting + true :crashed))) + (defn stop-pd-service! [test node svc] (c/su (cu/stop-daemon! (get-in pd-services [svc :bin]) (get-in pd-services [svc :pid-file])) @@ -429,6 +470,11 @@ (cu/stop-daemon! db-bin system-db-pid-file) (cu/grepkill! db-bin))) +(defn stop-tikv-worker! [test node] + (c/su + (cu/stop-daemon! tikv-worker-bin tikv-worker-pid-file) + (cu/grepkill! tikv-worker-bin))) + (defn stop-db! [test node] (c/su (cu/stop-daemon! db-bin db-pid-file) (cu/grepkill! db-bin))) @@ -436,7 +482,8 @@ "Stops all daemons" [test node] (when (:enable-system-tidb test) - (stop-system-db! test node)) + (stop-system-db! test node) + (stop-tikv-worker! test node)) (stop-db! test node) (stop-kv! test node) (stop-pd! test node)) @@ -480,7 +527,7 @@ (when (not (cu/exists? tidb-bin-dir)) (info "Creating bin layout for TiDB tarball") (c/exec :mkdir :-p tidb-bin-dir) - (doseq [b [pd-bin kv-bin db-bin pdctl-bin]] + (doseq [b [pd-bin kv-bin db-bin pdctl-bin tikv-worker-bin]] (when (cu/exists? (str tidb-dir "/" b)) (c/exec :ln :-sf (str tidb-dir "/" b) (str tidb-bin-dir "/" b))))) @@ -561,7 +608,8 @@ (install! test node) (configure!) (when enable-system? - (configure-system-db!)) + (configure-system-db!) + (configure-tikv-worker!)) (jepsen/synchronize test 180) (try+ (start-wait-pd! test node) @@ -573,6 +621,11 @@ (start-wait-kv! 
test node) (jepsen/synchronize test) + ; Start tikv-worker after TiKV, before waiting for replicas + (when enable-system? + (start-wait-tikv-worker! test node) + (jepsen/synchronize test)) + ; We have to wait for every region to become totally replicated ; before starting any TiDB instance: if we start TiDB first, it ; might take 80+ minutes to converge. @@ -625,7 +678,9 @@ (:enable-system-tidb test) (into [system-db-log-file system-db-slow-file - system-db-stdout])) + system-db-stdout + tikv-worker-log-file + tikv-worker-stdout])) pd-logs (if (:pd-services test) [(get-in pd-services [:api :log-file]) (get-in pd-services [:api :stdout]) diff --git a/tidb/src/tidb/nemesis.clj b/tidb/src/tidb/nemesis.clj index 6757fdafc..9b0ea502f 100644 --- a/tidb/src/tidb/nemesis.clj +++ b/tidb/src/tidb/nemesis.clj @@ -18,7 +18,7 @@ [slingshot.slingshot :refer [try+ throw+]])) (defn process-nemesis - "A nemesis that can pause, resume, start, stop, and kill tidb, tikv, and pd." + "A nemesis that can pause, resume, start, stop, and kill tidb, tikv, tikv-worker, and pd." [] (reify nemesis/Nemesis (setup! [this test] this) @@ -27,8 +27,8 @@ (let [nodes (:nodes test) nodes (case (:f op) ; When resuming, resume all nodes - (:resume-pd :resume-kv :resume-db - :start-pd :start-kv :start-db) nodes + (:resume-pd :resume-kv :resume-db :resume-tikv-worker + :start-pd :start-kv :start-db :start-tikv-worker) nodes (take (condp > (rand) 0.4 1 0.7 2 0.85 3 0.95 4 5) (shuffle nodes))) ; If the op wants to give us nodes, that's great @@ -37,21 +37,26 @@ (c/on-nodes test nodes (fn [test node] (case (:f op) - :start-pd (db/start-pd! test node) - :start-kv (db/start-kv! test node) - :start-db (db/start-db! test node) - :kill-pd (db/stop-pd! test node) - :kill-kv (db/stop-kv! test node) - :kill-db (db/stop-db! test node) - :stop-pd (cu/signal! db/pd-bin :TERM) - :stop-kv (cu/signal! db/kv-bin :TERM) - :stop-db (cu/signal! db/db-bin :TERM) - :pause-pd (cu/signal! db/pd-bin :STOP) - :pause-kv (cu/signal! 
db/kv-bin :STOP) - :pause-db (cu/signal! db/db-bin :STOP) - :resume-pd (cu/signal! db/pd-bin :CONT) - :resume-kv (cu/signal! db/kv-bin :CONT) - :resume-db (cu/signal! db/db-bin :CONT))))))) + :start-pd (db/start-pd! test node) + :start-kv (db/start-kv! test node) + :start-db (db/start-db! test node) + :start-tikv-worker (if (:enable-system-tidb test) (db/start-tikv-worker! test node) :tikv-worker-disabled) + :kill-pd (db/stop-pd! test node) + :kill-kv (db/stop-kv! test node) + :kill-db (db/stop-db! test node) + :kill-tikv-worker (db/stop-tikv-worker! test node) + :stop-pd (cu/signal! db/pd-bin :TERM) + :stop-kv (cu/signal! db/kv-bin :TERM) + :stop-db (cu/signal! db/db-bin :TERM) + :stop-tikv-worker (cu/signal! db/tikv-worker-bin :TERM) + :pause-pd (cu/signal! db/pd-bin :STOP) + :pause-kv (cu/signal! db/kv-bin :STOP) + :pause-db (cu/signal! db/db-bin :STOP) + :pause-tikv-worker (cu/signal! db/tikv-worker-bin :STOP) + :resume-pd (cu/signal! db/pd-bin :CONT) + :resume-kv (cu/signal! db/kv-bin :CONT) + :resume-db (cu/signal! db/db-bin :CONT) + :resume-tikv-worker (cu/signal! db/tikv-worker-bin :CONT))))))) (teardown! 
[this test]))) @@ -227,11 +232,11 @@ "Merges together all nemeses" [n] (nemesis/compose - {#{:start-pd :start-kv :start-db - :kill-pd :kill-kv :kill-db - :stop-pd :stop-kv :stop-db - :pause-pd :pause-kv :pause-db - :resume-pd :resume-kv :resume-db} (process-nemesis) + {#{:start-pd :start-kv :start-db :start-tikv-worker + :kill-pd :kill-kv :kill-db :kill-tikv-worker + :stop-pd :stop-kv :stop-db :stop-tikv-worker + :pause-pd :pause-kv :pause-db :pause-tikv-worker + :resume-pd :resume-kv :resume-db :resume-tikv-worker} (process-nemesis) #{:shuffle-leader :del-shuffle-leader :shuffle-region :del-shuffle-region :random-merge :del-random-merge} (schedule-nemesis) @@ -338,18 +343,24 @@ (op :start-kv)) (o {:kill-db (op :kill-db)} (op :start-db)) + (o {:kill-tikv-worker (op :kill-tikv-worker)} + (op :start-tikv-worker)) (o {:stop-pd (op :stop-pd)} (op :start-pd)) (o {:stop-kv (op :stop-kv)} (op :start-kv)) (o {:stop-db (op :stop-db)} (op :start-db)) + (o {:stop-tikv-worker (op :stop-tikv-worker)} + (op :start-tikv-worker)) (o {:pause-pd (op :pause-pd)} (op :resume-pd)) (o {:pause-kv (op :pause-kv)} (op :resume-kv)) (o {:pause-db (op :pause-db)} (op :resume-db)) + (o {:pause-tikv-worker (op :pause-tikv-worker)} + (op :resume-tikv-worker)) (o {:shuffle-leader (op :shuffle-leader)} (op :del-shuffle-leader)) (o {:shuffle-region (op :shuffle-region)} @@ -383,19 +394,22 @@ [n] (->> (cond-> [] ; (:clock-skew n) (conj :reset-clock) - (:pause-pd n) (conj :resume-pd) - (:pause-kv n) (conj :resume-kv) - (:pause-db n) (conj :resume-db) - (:kill-pd n) (conj :start-pd) - (:kill-kv n) (conj :start-kv) - (:kill-db n) (conj :start-db) - (:stop-pd n) (conj :start-pd) - (:stop-kv n) (conj :start-kv) - (:stop-db n) (conj :start-db) - (:shuffle-leader n) (conj :del-shuffle-leader) - (:shuffle-region n) (conj :del-shuffle-region) - (:random-merge n) (conj :del-random-merge) - (:start-netem n) (conj :stop-netem) + (:pause-pd n) (conj :resume-pd) + (:pause-kv n) (conj :resume-kv) + (:pause-db n) 
(conj :resume-db) + (:pause-tikv-worker n) (conj :resume-tikv-worker) + (:kill-pd n) (conj :start-pd) + (:kill-kv n) (conj :start-kv) + (:kill-db n) (conj :start-db) + (:kill-tikv-worker n) (conj :start-tikv-worker) + (:stop-pd n) (conj :start-pd) + (:stop-kv n) (conj :start-kv) + (:stop-db n) (conj :start-db) + (:stop-tikv-worker n) (conj :start-tikv-worker) + (:shuffle-leader n) (conj :del-shuffle-leader) + (:shuffle-region n) (conj :del-shuffle-region) + (:random-merge n) (conj :del-random-merge) + (:start-netem n) (conj :stop-netem) (:enable-failpoint n) (conj :disable-failpoint) @@ -464,18 +478,21 @@ (defn expand-options "We support shorthand options in nemesis maps, like :kill, which expands to - :kill-pd, :kill-kv, and :kill-db. This function expands those." + :kill-pd, :kill-kv, :kill-db, and :kill-tikv-worker. This function expands those." [n] (cond-> n (:kill n) (assoc :kill-pd true :kill-kv true - :kill-db true) + :kill-db true + :kill-tikv-worker true) (:stop n) (assoc :stop-pd true :stop-kv true - :stop-db true) + :stop-db true + :stop-tikv-worker true) (:pause n) (assoc :pause-pd true :pause-kv true - :pause-db true) + :pause-db true + :pause-tikv-worker true) (:schedules n) (assoc :shuffle-leader true :shuffle-region true :random-merge true)