Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tidb/resources/system-tidb.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ lease = "5s"
split-table = true
keyspace-name = "SYSTEM"

[instance]
tidb_service_scope = "dxf_service"

[log]
slow-query-file = "system-slow.log"

Expand Down
7 changes: 7 additions & 0 deletions tidb/resources/tikv-worker.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# tikv-worker configuration for Jepsen tests
# Note: the [dfs] section is intentionally not defined here. Provide it via the
# WORKER_CONFIG manifest parameter, since S3/MinIO settings are environment-specific.

[ia]
mem-cap = "5GB"
disk-cap = "20GB"
15 changes: 14 additions & 1 deletion tidb/src/tidb/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,15 @@
:kill-pd
:kill-kv
:kill-db
:kill-tikv-worker
:stop-pd
:stop-kv
:stop-db
:stop-tikv-worker
:pause-pd
:pause-kv
:pause-db
:pause-tikv-worker
:schedules
:shuffle-leader
:shuffle-region
Expand All @@ -181,7 +184,9 @@

(def process-faults
"Faults affecting individual processes"
[:kill-pd :kill-kv :kill-db :stop-pd :stop-kv :stop-db :pause-pd :pause-kv :pause-db])
[:kill-pd :kill-kv :kill-db :kill-tikv-worker
:stop-pd :stop-kv :stop-db :stop-tikv-worker
:pause-pd :pause-kv :pause-db :pause-tikv-worker])

(def network-faults
"Faults affecting the network"
Expand Down Expand Up @@ -266,6 +271,10 @@
:color "#E9A0CF"
:start #{:kill-db :stop-db}
:stop #{:start-db}}
{:name "kill tikv-worker"
:color "#E9C0A0"
:start #{:kill-tikv-worker :stop-tikv-worker}
:stop #{:start-tikv-worker}}
{:name "pause pd"
:color "#C5A0E9"
:start #{:pause-pd}
Expand All @@ -278,6 +287,10 @@
:color "#A6A0E9"
:start #{:pause-db}
:stop #{:resume-db}}
{:name "pause tikv-worker"
:color "#D9C0A0"
:start #{:pause-tikv-worker}
:stop #{:resume-tikv-worker}}
{:name "shuffle-leader"
:color "#A6D0E9"
:start #{:shuffle-leader}
Expand Down
63 changes: 59 additions & 4 deletions tidb/src/tidb/db.clj
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@
; Pid file used to stop and liveness-check the system TiDB daemon.
(def system-db-pid-file (str tidb-dir "/system-db.pid"))
; Port handed to the system TiDB instance via -P.
(def system-db-port 14000)
; Port handed to the system TiDB instance via --status; its /status page is
; polled on 127.0.0.1 to decide readiness.
(def system-db-status-port 11080)
; Name of the tikv-worker binary. It is launched as ./bin/<name> with tidb-dir
; as the working directory, and the same name is used when stopping,
; grepkilling, and signalling the process.
(def tikv-worker-bin "tikv-worker")
; Destination for the bundled tikv-worker.toml resource; passed as --config.
(def tikv-worker-config-file (str tidb-dir "/tikv-worker.conf"))
; Collected together with the system-db logs when :enable-system-tidb is set.
; NOTE(review): nothing visible passes this path to the daemon (no log flag in
; start-tikv-worker!, no log section in tikv-worker.toml) -- confirm that
; tikv-worker really writes its log here.
(def tikv-worker-log-file (str tidb-dir "/tikv-worker.log"))
; The :logfile given to cu/start-daemon! for tikv-worker.
(def tikv-worker-stdout (str tidb-dir "/tikv-worker.stdout"))
; Pid file used to start, stop, and liveness-check the tikv-worker daemon.
(def tikv-worker-pid-file (str tidb-dir "/tikv-worker.pid"))
; Passed to tikv-worker as --data-dir.
(def tikv-worker-data-dir (str tidb-dir "/data/tikv-worker"))
; Used for --addr and --advertise-addr, and also for the /status readiness
; probe on 127.0.0.1.
(def tikv-worker-port 19000)
(def pd-services
{:api
{:bin "pd-api"
Expand Down Expand Up @@ -133,6 +140,12 @@
(c/su (c/exec :echo (slurp (io/resource "system-tidb.conf"))
:> system-db-config-file)))

(defn configure-tikv-worker!
  "Echoes the contents of the bundled tikv-worker.toml resource into
  tikv-worker-config-file. The write runs inside c/su."
  []
  (let [toml (slurp (io/resource "tikv-worker.toml"))]
    (c/su
      (c/exec :echo toml :> tikv-worker-config-file))))

(defn configure!
"Write all config files."
[]
Expand Down Expand Up @@ -298,6 +311,21 @@
:-P (str system-db-port)
:--status (str system-db-status-port))))

(defn start-tikv-worker!
  "Launches the tikv-worker binary as a daemon through cu/start-daemon!, inside
  c/su. The process is started from tidb-dir, listens on every interface at
  tikv-worker-port, advertises itself under the node's name on that same port,
  and is pointed at the PD endpoints derived from `test`."
  [test node]
  (let [listen-addr    (str "0.0.0.0:" tikv-worker-port)
        advertise-addr (str (name node) ":" tikv-worker-port)
        binary         (str "./bin/" tikv-worker-bin)
        daemon-opts    {:logfile tikv-worker-stdout
                        :pidfile tikv-worker-pid-file
                        :chdir   tidb-dir}]
    ; NOTE(review): no log-file flag is passed, yet tikv-worker-log-file is
    ; collected with the other logs -- confirm where the worker writes its log.
    (c/su
      (cu/start-daemon! daemon-opts
                        binary
                        :--addr           listen-addr
                        :--advertise-addr advertise-addr
                        :--pd-endpoints   (pd-endpoints test)
                        :--config         tikv-worker-config-file
                        :--data-dir       tikv-worker-data-dir))))

(defn page-ready?
"Fetches a status page URL on the local node, and returns true iff the page
was available."
Expand Down Expand Up @@ -326,6 +354,11 @@
[]
(page-ready? (str "http://127.0.0.1:" system-db-status-port "/status")))

(defn tikv-worker-ready?
  "Returns the result of page-ready? for the /status URL served on
  tikv-worker-port at 127.0.0.1."
  []
  (let [status-url (str "http://127.0.0.1:" tikv-worker-port "/status")]
    (page-ready? status-url)))

(defn restart-loop*
"TiDB is fragile on startup; processes love to crash if they can't complete
their initial requests to network dependencies. We try to work around this by
Expand Down Expand Up @@ -405,6 +438,14 @@
(cu/daemon-running? system-db-pid-file) :starting
true :crashed)))

; Start-and-wait wrapper for tikv-worker. It uses the same three-state status
; check (:ready / :starting / :crashed) that is applied to system-db.
(defn start-wait-tikv-worker!
"Starts TiKV-Worker, waiting for its status page to come online."
[test node]
; restart-loop receives a label, a form that starts the daemon, and a form
; that classifies the daemon's current state.
(restart-loop :tikv-worker (start-tikv-worker! test node)
; :ready -- the /status page answered.
(cond (tikv-worker-ready?) :ready
; :starting -- no status page yet, but the process behind the pid file is alive.
(cu/daemon-running? tikv-worker-pid-file) :starting
; :crashed -- neither of the above.
; NOTE(review): presumably restart-loop re-runs the start form in this case -- confirm.
true :crashed)))

(defn stop-pd-service! [test node svc]
(c/su
(cu/stop-daemon! (get-in pd-services [svc :bin]) (get-in pd-services [svc :pid-file]))
Expand All @@ -429,14 +470,20 @@
(cu/stop-daemon! db-bin system-db-pid-file)
(cu/grepkill! db-bin)))

(defn stop-tikv-worker!
  "Stops the tikv-worker daemon recorded in tikv-worker-pid-file, then runs
  cu/grepkill! on the binary name. Both steps execute inside c/su. `test` and
  `node` are accepted for signature parity with the other stop-* functions
  and are not used."
  [test node]
  (let [bin tikv-worker-bin]
    (c/su
      (cu/stop-daemon! bin tikv-worker-pid-file)
      (cu/grepkill! bin))))

(defn stop-db!
  "Stops the TiDB daemon recorded in db-pid-file, then runs cu/grepkill! on
  db-bin. Both steps execute inside c/su. `test` and `node` are not used."
  [test node]
  ; NOTE(review): the system-db daemon is stopped using db-bin as well, so this
  ; grepkill presumably matches system-db processes too -- confirm that is intended.
  (let [bin db-bin]
    (c/su
      (cu/stop-daemon! bin db-pid-file)
      (cu/grepkill! bin))))

(defn stop!
"Stops all daemons"
[test node]
(when (:enable-system-tidb test)
(stop-system-db! test node))
(stop-system-db! test node)
(stop-tikv-worker! test node))
(stop-db! test node)
(stop-kv! test node)
(stop-pd! test node))
Expand Down Expand Up @@ -480,7 +527,7 @@
(when (not (cu/exists? tidb-bin-dir))
(info "Creating bin layout for TiDB tarball")
(c/exec :mkdir :-p tidb-bin-dir)
(doseq [b [pd-bin kv-bin db-bin pdctl-bin]]
(doseq [b [pd-bin kv-bin db-bin pdctl-bin tikv-worker-bin]]
(when (cu/exists? (str tidb-dir "/" b))
(c/exec :ln :-sf (str tidb-dir "/" b)
(str tidb-bin-dir "/" b)))))
Expand Down Expand Up @@ -561,7 +608,8 @@
(install! test node)
(configure!)
(when enable-system?
(configure-system-db!))
(configure-system-db!)
(configure-tikv-worker!))
(jepsen/synchronize test 180)

(try+ (start-wait-pd! test node)
Expand All @@ -573,6 +621,11 @@
(start-wait-kv! test node)
(jepsen/synchronize test)

; Start tikv-worker after TiKV, before waiting for replicas
(when enable-system?
(start-wait-tikv-worker! test node)
(jepsen/synchronize test))

; We have to wait for every region to become totally replicated
; before starting any TiDB instance: if we start TiDB first, it
; might take 80+ minutes to converge.
Expand Down Expand Up @@ -625,7 +678,9 @@
(:enable-system-tidb test)
(into [system-db-log-file
system-db-slow-file
system-db-stdout]))
system-db-stdout
tikv-worker-log-file
tikv-worker-stdout]))
pd-logs (if (:pd-services test)
[(get-in pd-services [:api :log-file])
(get-in pd-services [:api :stdout])
Expand Down
97 changes: 57 additions & 40 deletions tidb/src/tidb/nemesis.clj
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
[slingshot.slingshot :refer [try+ throw+]]))

(defn process-nemesis
"A nemesis that can pause, resume, start, stop, and kill tidb, tikv, and pd."
"A nemesis that can pause, resume, start, stop, and kill tidb, tikv, tikv-worker, and pd."
[]
(reify nemesis/Nemesis
(setup! [this test] this)
Expand All @@ -27,8 +27,8 @@
(let [nodes (:nodes test)
nodes (case (:f op)
; When resuming, resume all nodes
(:resume-pd :resume-kv :resume-db
:start-pd :start-kv :start-db) nodes
(:resume-pd :resume-kv :resume-db :resume-tikv-worker
:start-pd :start-kv :start-db :start-tikv-worker) nodes

(take (condp > (rand) 0.4 1 0.7 2 0.85 3 0.95 4 5) (shuffle nodes)))
; If the op wants to give us nodes, that's great
Expand All @@ -37,21 +37,26 @@
(c/on-nodes test nodes
(fn [test node]
(case (:f op)
:start-pd (db/start-pd! test node)
:start-kv (db/start-kv! test node)
:start-db (db/start-db! test node)
:kill-pd (db/stop-pd! test node)
:kill-kv (db/stop-kv! test node)
:kill-db (db/stop-db! test node)
:stop-pd (cu/signal! db/pd-bin :TERM)
:stop-kv (cu/signal! db/kv-bin :TERM)
:stop-db (cu/signal! db/db-bin :TERM)
:pause-pd (cu/signal! db/pd-bin :STOP)
:pause-kv (cu/signal! db/kv-bin :STOP)
:pause-db (cu/signal! db/db-bin :STOP)
:resume-pd (cu/signal! db/pd-bin :CONT)
:resume-kv (cu/signal! db/kv-bin :CONT)
:resume-db (cu/signal! db/db-bin :CONT)))))))
:start-pd (db/start-pd! test node)
:start-kv (db/start-kv! test node)
:start-db (db/start-db! test node)
:start-tikv-worker (db/start-tikv-worker! test node)
:kill-pd (db/stop-pd! test node)
:kill-kv (db/stop-kv! test node)
:kill-db (db/stop-db! test node)
:kill-tikv-worker (db/stop-tikv-worker! test node)
:stop-pd (cu/signal! db/pd-bin :TERM)
:stop-kv (cu/signal! db/kv-bin :TERM)
:stop-db (cu/signal! db/db-bin :TERM)
:stop-tikv-worker (cu/signal! db/tikv-worker-bin :TERM)
:pause-pd (cu/signal! db/pd-bin :STOP)
:pause-kv (cu/signal! db/kv-bin :STOP)
:pause-db (cu/signal! db/db-bin :STOP)
:pause-tikv-worker (cu/signal! db/tikv-worker-bin :STOP)
:resume-pd (cu/signal! db/pd-bin :CONT)
:resume-kv (cu/signal! db/kv-bin :CONT)
:resume-db (cu/signal! db/db-bin :CONT)
:resume-tikv-worker (cu/signal! db/tikv-worker-bin :CONT)))))))

(teardown! [this test])))

Expand Down Expand Up @@ -227,11 +232,11 @@
"Merges together all nemeses"
[n]
(nemesis/compose
{#{:start-pd :start-kv :start-db
:kill-pd :kill-kv :kill-db
:stop-pd :stop-kv :stop-db
:pause-pd :pause-kv :pause-db
:resume-pd :resume-kv :resume-db} (process-nemesis)
{#{:start-pd :start-kv :start-db :start-tikv-worker
:kill-pd :kill-kv :kill-db :kill-tikv-worker
:stop-pd :stop-kv :stop-db :stop-tikv-worker
:pause-pd :pause-kv :pause-db :pause-tikv-worker
:resume-pd :resume-kv :resume-db :resume-tikv-worker} (process-nemesis)
#{:shuffle-leader :del-shuffle-leader
:shuffle-region :del-shuffle-region
:random-merge :del-random-merge} (schedule-nemesis)
Expand Down Expand Up @@ -338,18 +343,24 @@
(op :start-kv))
(o {:kill-db (op :kill-db)}
(op :start-db))
(o {:kill-tikv-worker (op :kill-tikv-worker)}
(op :start-tikv-worker))
(o {:stop-pd (op :stop-pd)}
(op :start-pd))
(o {:stop-kv (op :stop-kv)}
(op :start-kv))
(o {:stop-db (op :stop-db)}
(op :start-db))
(o {:stop-tikv-worker (op :stop-tikv-worker)}
(op :start-tikv-worker))
(o {:pause-pd (op :pause-pd)}
(op :resume-pd))
(o {:pause-kv (op :pause-kv)}
(op :resume-kv))
(o {:pause-db (op :pause-db)}
(op :resume-db))
(o {:pause-tikv-worker (op :pause-tikv-worker)}
(op :resume-tikv-worker))
(o {:shuffle-leader (op :shuffle-leader)}
(op :del-shuffle-leader))
(o {:shuffle-region (op :shuffle-region)}
Expand Down Expand Up @@ -383,19 +394,22 @@
[n]
(->> (cond-> []
; (:clock-skew n) (conj :reset-clock)
(:pause-pd n) (conj :resume-pd)
(:pause-kv n) (conj :resume-kv)
(:pause-db n) (conj :resume-db)
(:kill-pd n) (conj :start-pd)
(:kill-kv n) (conj :start-kv)
(:kill-db n) (conj :start-db)
(:stop-pd n) (conj :start-pd)
(:stop-kv n) (conj :start-kv)
(:stop-db n) (conj :start-db)
(:shuffle-leader n) (conj :del-shuffle-leader)
(:shuffle-region n) (conj :del-shuffle-region)
(:random-merge n) (conj :del-random-merge)
(:start-netem n) (conj :stop-netem)
(:pause-pd n) (conj :resume-pd)
(:pause-kv n) (conj :resume-kv)
(:pause-db n) (conj :resume-db)
(:pause-tikv-worker n) (conj :resume-tikv-worker)
(:kill-pd n) (conj :start-pd)
(:kill-kv n) (conj :start-kv)
(:kill-db n) (conj :start-db)
(:kill-tikv-worker n) (conj :start-tikv-worker)
(:stop-pd n) (conj :start-pd)
(:stop-kv n) (conj :start-kv)
(:stop-db n) (conj :start-db)
(:stop-tikv-worker n) (conj :start-tikv-worker)
(:shuffle-leader n) (conj :del-shuffle-leader)
(:shuffle-region n) (conj :del-shuffle-region)
(:random-merge n) (conj :del-random-merge)
(:start-netem n) (conj :stop-netem)

(:enable-failpoint n)
(conj :disable-failpoint)
Expand Down Expand Up @@ -464,18 +478,21 @@

(defn expand-options
"We support shorthand options in nemesis maps, like :kill, which expands to
:kill-pd, :kill-kv, and :kill-db. This function expands those."
:kill-pd, :kill-kv, :kill-db, and :kill-tikv-worker. This function expands those."
[n]
(cond-> n
(:kill n) (assoc :kill-pd true
:kill-kv true
:kill-db true)
:kill-db true
:kill-tikv-worker true)
(:stop n) (assoc :stop-pd true
:stop-kv true
:stop-db true)
:stop-db true
:stop-tikv-worker true)
(:pause n) (assoc :pause-pd true
:pause-kv true
:pause-db true)
:pause-db true
:pause-tikv-worker true)
(:schedules n) (assoc :shuffle-leader true
:shuffle-region true
:random-merge true)
Expand Down