koboldcpp/koboldcpp.py at concedo · demandcluster/koboldcpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#-*- coding: utf-8 -*-

# KoboldCpp is an easy-to-use AI text-generation software for GGML models.
# It's a single self contained distributable from Concedo, that builds off llama.cpp,
# and adds a versatile Kobold API endpoint, additional format support,
# backward compatibility, as well as a fancy UI with persistent stories,
# editing tools, save formats, memory, world info, author's note, characters,
# scenarios and everything Kobold and KoboldAI Lite have to offer.

import os
try:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # try set GPU to PCI order first thing
except Exception:
    pass
import copy
import ctypes
import multiprocessing
import math
import re
import argparse
import platform
import base64
import struct
import json
import sys
import http.server
import time
import asyncio
import socket
import threading
import html
import random
import hashlib
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from typing import Tuple
import shutil
import subprocess
import gzip

# constants
# Several of these are used below as fixed array lengths inside ctypes structures,
# so changing them changes the memory layout of those structures.
sampler_order_max = 7 # length of the sampler_order array in generation_inputs
tensor_split_max = 16 # length of the tensor_split / draft_gpusplit arrays in load_model_inputs
images_max = 8 # length of the images array in generation_inputs
audio_max = 4 # length of the audio array in generation_inputs
bias_min_value = -100.0
bias_max_value = 100.0
logprobs_max = 10 # length of the tokens / token_ids arrays in logprob_item
default_draft_amount = 8
default_ttsmaxlen = 4096
default_visionmaxres = 1024
net_save_slots = 12
savestate_limit_default = 5
savestate_limit = 0 #savestate slots start at 0, only set when load model
default_vae_tile_threshold = 768
default_native_ctx = 16384
overridekv_max = 4 # length of the override_kv array in load_model_inputs
default_autofit_padding = 1024
lora_filenames_max = 4 # length of the lora_filenames array in sd_load_model_inputs

# abuse prevention
stop_token_max = 256
ban_token_max = 768
logit_bias_max = 512
dry_seq_break_max = 128
extra_images_max = 4 # for kontext/qwen img

# global vars
# Mutable module-level state shared by the functions in this file.
KcppVersion = "1.108.1"
showdebug = True
kcpp_instance = None #global running instance
global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}
using_gui_launcher = False

handle = None
friendlymodelname = "inactive"
friendlysdmodelname = "inactive"
friendlyembeddingsmodelname = "inactive"
lastgeneratedcomfyimg = b''
lastuploadedcomfyimg = b''
fullsdmodelpath = ""  #if empty, it's not initialized
password = "" #if empty, no auth key required
fullwhispermodelpath = "" #if empty, it's not initialized
ttsmodelpath = "" #if empty, not initialized
embeddingsmodelpath = "" #if empty, not initialized
maxctx = 8192
maxhordectx = 0 #set to whatever maxctx is if 0
maxhordelen = 1024
modelbusy = threading.Lock()
requestsinqueue = 0
ratelimitlookup = {}
defaultport = 5001
showsamplerwarning = True
showmaxctxwarning = True
showusedmemwarning = True
showmultigpuwarning = True
session_kudos_earned = 0
session_jobs = 0
session_starttime = None
exitcounter = -1
punishcounter = 0 #causes a timeout if too many errors
rewardcounter = 0 #reduces error counts for successful jobs
totalgens = 0
currentusergenkey = "" #store a special key so polled streaming works even in multiuser
pendingabortkey = "" #if an abort is received for the non-active request, remember it (at least 1) to cancel later
args = None #global args
runmode_untouched = True
modelfile_extracted_meta = None
calulated_gpu_overhead = 0 # may be populated at runtime, can also be missing if undetected
importvars_in_progress = False
has_multiplayer = False
has_audio_support = False
has_vision_support = False
cached_chat_template = None
savedata_obj = None
mcp_connections = [] #every element is linked to one mcp source, contains obj {"client":obj, "tools":[]}
mcp_lock = threading.Lock()
multiplayer_story_data_compressed = None #stores the full compressed story of the current multiplayer session
multiplayer_turn_major = 1 # to keep track of when a client needs to sync their stories
multiplayer_turn_minor = 1
multiplayer_dataformat = "" # used to tell what is the data payload in saved story. set by client
multiplayer_lastactive = {} # timestamp of last activity for each unique player
websearch_lastquery = ""
websearch_lastresponse = []
preloaded_story = None
chatcompl_adapter = None
chatcompl_adapter_list = None #if using autoguess, this will be populated with potential adapters
embedded_kailite = None
embedded_kailite_gz = None
embedded_kcpp_docs = None
embedded_kcpp_docs_gz = None
embedded_kcpp_sdui = None
embedded_kcpp_sdui_gz = None
embedded_lcpp_ui_gz = None
sslvalid = False
nocertify = False
start_time = time.time()
last_req_time = time.time()
last_non_horde_req_time = time.time()
currfinishreason = None
zenity_recent_dir = os.getcwd()
zenity_permitted = True

# Saved handles used by suppress_stdout() / restore_stdout(); all None while output is not suppressed.
saved_stdout = None # duplicated OS-level fd of the original stdout
saved_stderr = None # duplicated OS-level fd of the original stderr
saved_stdout_py = None # original sys.stdout object
saved_stderr_py = None # original sys.stderr object
stdout_nullfile = None # OS-level fd opened on os.devnull
stdout_nullfile_py = None # Python file object opened on os.devnull

# Device tables; the values here are only initial placeholders.
CUDevices = ["1","2","3","4","All"]
CUDevicesNames = ["","","","",""]
VKDevicesNames = ["","","",""]
VKIsDGPU = [0,0,0,0]
MaxMemory = [0]
MaxFreeMemory = [0]

class logit_bias(ctypes.Structure):
    """ctypes record pairing a 32-bit token id with a float bias value."""
    _fields_ = [
        ("token_id", ctypes.c_int32),
        ("bias", ctypes.c_float),
    ]

class token_count_outputs(ctypes.Structure):
    """ctypes record holding an int count and a pointer to an int array of ids."""
    _fields_ = [
        ("count", ctypes.c_int),
        ("ids", ctypes.POINTER(ctypes.c_int)),
    ]

class logprob_item(ctypes.Structure):
    """ctypes record with log-probability data for one token.

    Holds the selected token (text, id, logprob) plus alternatives: `tokens`
    and `token_ids` are fixed arrays of length logprobs_max, `logprobs` is a
    float pointer. Presumably `option_count` says how many alternatives are
    populated - confirm against the native library.
    """
    _fields_ = [
        ("option_count", ctypes.c_int),
        ("selected_token", ctypes.c_char_p),
        ("selected_logprob", ctypes.c_float),
        ("selected_token_id", ctypes.c_int32),
        ("tokens", ctypes.c_char_p * logprobs_max),
        ("token_ids", ctypes.c_int32 * logprobs_max),
        ("logprobs", ctypes.POINTER(ctypes.c_float)),
    ]
class last_logprobs_outputs(ctypes.Structure):
    """ctypes record holding an int count and a pointer to logprob_item entries."""
    _fields_ = [
        ("count", ctypes.c_int),
        ("logprob_items", ctypes.POINTER(logprob_item)),
    ]

class load_model_inputs(ctypes.Structure):
    """ctypes record of the settings supplied when loading a text model.

    Field order and types define the memory layout; presumably they must
    mirror the matching struct in the native library (not visible here), so
    do not reorder or retype fields without checking that side.
    """
    _fields_ = [
        ("threads", ctypes.c_int),
        ("blasthreads", ctypes.c_int),
        ("max_context_length", ctypes.c_int),
        ("low_vram", ctypes.c_bool),
        ("use_mmq", ctypes.c_bool),
        ("use_rowsplit", ctypes.c_bool),
        ("executable_path", ctypes.c_char_p),
        ("model_filename", ctypes.c_char_p),
        ("lora_filename", ctypes.c_char_p),
        ("draftmodel_filename", ctypes.c_char_p),
        ("draft_amount", ctypes.c_int),
        ("draft_gpulayers", ctypes.c_int),
        ("draft_gpusplit", ctypes.c_float * tensor_split_max),
        ("mmproj_filename", ctypes.c_char_p),
        ("mmproj_cpu", ctypes.c_bool),
        ("visionmaxres", ctypes.c_int),
        ("use_mmap", ctypes.c_bool),
        ("use_mlock", ctypes.c_bool),
        ("use_smartcontext", ctypes.c_bool),
        ("use_contextshift", ctypes.c_bool),
        ("use_fastforward", ctypes.c_bool),
        ("kcpp_main_gpu", ctypes.c_int),
        ("vulkan_info", ctypes.c_char_p),
        ("batchsize", ctypes.c_int),
        ("autofit", ctypes.c_bool),
        ("autofit_tax_mb", ctypes.c_int),
        ("gpulayers", ctypes.c_int),
        ("rope_freq_scale", ctypes.c_float),
        ("rope_freq_base", ctypes.c_float),
        ("overridenativecontext", ctypes.c_int),
        ("moe_experts", ctypes.c_int),
        ("moecpu", ctypes.c_int),
        ("no_bos_token", ctypes.c_bool),
        ("load_guidance", ctypes.c_bool),
        ("override_kv", ctypes.c_char_p * overridekv_max),
        ("override_tensors", ctypes.c_char_p),
        ("flash_attention", ctypes.c_bool),
        ("tensor_split", ctypes.c_float * tensor_split_max),
        ("quant_k", ctypes.c_int),
        ("quant_v", ctypes.c_int),
        ("check_slowness", ctypes.c_bool),
        ("highpriority", ctypes.c_bool),
        ("swa_support", ctypes.c_bool),
        ("smartcache", ctypes.c_bool),
        ("smartcacheslots", ctypes.c_int),
        ("pipelineparallel", ctypes.c_bool),
        ("lora_multiplier", ctypes.c_float),
        ("devices_override", ctypes.c_char_p),
        ("quiet", ctypes.c_bool),
        ("debugmode", ctypes.c_int),
    ]

class generation_inputs(ctypes.Structure):
    """ctypes record of the per-request parameters for text generation.

    Variable-length lists (DRY sequence breakers, stop sequences, logit
    biases, banned tokens) are passed as a length field followed by a pointer
    field. Field order and types define the memory layout; presumably they
    must mirror the matching struct in the native library (not visible here).
    """
    _fields_ = [
        ("seed", ctypes.c_int),
        ("prompt", ctypes.c_char_p),
        ("memory", ctypes.c_char_p),
        ("negative_prompt", ctypes.c_char_p),
        ("guidance_scale", ctypes.c_float),
        ("images", ctypes.c_char_p * images_max),
        ("audio", ctypes.c_char_p * audio_max),
        ("max_context_length", ctypes.c_int),
        ("max_length", ctypes.c_int),
        ("temperature", ctypes.c_float),
        ("top_k", ctypes.c_int),
        ("top_a", ctypes.c_float),
        ("top_p", ctypes.c_float),
        ("min_p", ctypes.c_float),
        ("typical_p", ctypes.c_float),
        ("tfs", ctypes.c_float),
        ("nsigma", ctypes.c_float),
        ("rep_pen", ctypes.c_float),
        ("rep_pen_range", ctypes.c_int),
        ("rep_pen_slope", ctypes.c_float),
        ("presence_penalty", ctypes.c_float),
        ("mirostat", ctypes.c_int),
        ("mirostat_tau", ctypes.c_float),
        ("mirostat_eta", ctypes.c_float),
        ("xtc_threshold", ctypes.c_float),
        ("xtc_probability", ctypes.c_float),
        ("sampler_order", ctypes.c_int * sampler_order_max),
        ("sampler_len", ctypes.c_int),
        ("allow_eos_token", ctypes.c_bool),
        ("bypass_eos_token", ctypes.c_bool),
        ("tool_call_fix", ctypes.c_bool),
        ("render_special", ctypes.c_bool),
        ("stream_sse", ctypes.c_bool),
        ("grammar", ctypes.c_char_p),
        ("grammar_retain_state", ctypes.c_bool),
        ("dynatemp_range", ctypes.c_float),
        ("dynatemp_exponent", ctypes.c_float),
        ("smoothing_factor", ctypes.c_float),
        ("smoothing_curve", ctypes.c_float),
        ("adaptive_target", ctypes.c_float),
        ("adaptive_decay", ctypes.c_float),
        ("dry_multiplier", ctypes.c_float),
        ("dry_base", ctypes.c_float),
        ("dry_allowed_length", ctypes.c_int),
        ("dry_penalty_last_n", ctypes.c_int),
        ("dry_sequence_breakers_len", ctypes.c_int),
        ("dry_sequence_breakers", ctypes.POINTER(ctypes.c_char_p)),
        ("stop_sequence_len", ctypes.c_int),
        ("stop_sequence", ctypes.POINTER(ctypes.c_char_p)),
        ("logit_biases_len", ctypes.c_int),
        ("logit_biases", ctypes.POINTER(logit_bias)),
        ("banned_tokens_len", ctypes.c_int),
        ("banned_tokens", ctypes.POINTER(ctypes.c_char_p)),
    ]

class generation_outputs(ctypes.Structure):
    """ctypes record of a text generation result: status, stop reason, token counts and text."""
    _fields_ = [
        ("status", ctypes.c_int),
        ("stopreason", ctypes.c_int),
        ("prompt_tokens", ctypes.c_int),
        ("completion_tokens", ctypes.c_int),
        ("text", ctypes.c_char_p),
    ]

class sd_load_model_inputs(ctypes.Structure):
    """ctypes record of the settings supplied when loading an image generation model.

    Field order and types define the memory layout; presumably they must
    mirror the matching struct in the native library (not visible here).
    """
    _fields_ = [
        ("model_filename", ctypes.c_char_p),
        ("executable_path", ctypes.c_char_p),
        ("kcpp_main_gpu", ctypes.c_int),
        ("vulkan_info", ctypes.c_char_p),
        ("threads", ctypes.c_int),
        ("quant", ctypes.c_int),
        ("flash_attention", ctypes.c_bool),
        ("offload_cpu", ctypes.c_bool),
        ("vae_cpu", ctypes.c_bool),
        ("clip_cpu", ctypes.c_bool),
        ("diffusion_conv_direct", ctypes.c_bool),
        ("vae_conv_direct", ctypes.c_bool),
        ("taesd", ctypes.c_bool),
        ("tiled_vae_threshold", ctypes.c_int),
        ("t5xxl_filename", ctypes.c_char_p),
        ("clip1_filename", ctypes.c_char_p),
        ("clip2_filename", ctypes.c_char_p),
        ("vae_filename", ctypes.c_char_p),
        ("lora_filenames", ctypes.c_char_p * lora_filenames_max),
        ("lora_multiplier", ctypes.c_float),
        ("lora_apply_mode", ctypes.c_int),
        ("photomaker_filename", ctypes.c_char_p),
        ("upscaler_filename", ctypes.c_char_p),
        ("img_hard_limit", ctypes.c_int),
        ("img_soft_limit", ctypes.c_int),
        ("devices_override", ctypes.c_char_p),
        ("quiet", ctypes.c_bool),
        ("debugmode", ctypes.c_int),
    ]

class sd_generation_inputs(ctypes.Structure):
    """ctypes record of the per-request parameters for image generation.

    `extra_images` is a pointer to strings whose count is carried in
    `extra_images_len`. Field order and types define the memory layout;
    presumably they must mirror the matching native struct (not visible here).
    """
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("negative_prompt", ctypes.c_char_p),
        ("init_images", ctypes.c_char_p),
        ("mask", ctypes.c_char_p),
        ("extra_images_len", ctypes.c_int),
        ("extra_images", ctypes.POINTER(ctypes.c_char_p)),
        ("flip_mask", ctypes.c_bool),
        ("denoising_strength", ctypes.c_float),
        ("cfg_scale", ctypes.c_float),
        ("distilled_guidance", ctypes.c_float),
        ("shifted_timestep", ctypes.c_int),
        ("sample_steps", ctypes.c_int),
        ("width", ctypes.c_int),
        ("height", ctypes.c_int),
        ("seed", ctypes.c_int),
        ("sample_method", ctypes.c_char_p),
        ("scheduler", ctypes.c_char_p),
        ("clip_skip", ctypes.c_int),
        ("vid_req_frames", ctypes.c_int),
        ("video_output_type", ctypes.c_int),
        ("remove_limits", ctypes.c_bool),
        ("circular_x", ctypes.c_bool),
        ("circular_y", ctypes.c_bool),
        ("upscale", ctypes.c_bool),
    ]

class sd_generation_outputs(ctypes.Structure):
    """ctypes record of an image generation result: status, animated flag and two data strings."""
    _fields_ = [
        ("status", ctypes.c_int),
        ("animated", ctypes.c_int),
        ("data", ctypes.c_char_p),
        ("data_extra", ctypes.c_char_p),
    ]

class sd_upscale_inputs(ctypes.Structure):
    """ctypes record of an upscale request: the source image string and an integer resize value."""
    _fields_ = [
        ("init_images", ctypes.c_char_p),
        ("upscaling_resize", ctypes.c_int),
    ]

class sd_info_outputs(ctypes.Structure):
    """ctypes record holding an int status and a data string."""
    _fields_ = [
        ("status", ctypes.c_int),
        ("data", ctypes.c_char_p),
    ]

class whisper_load_model_inputs(ctypes.Structure):
    """ctypes record of the settings supplied when loading a whisper model."""
    _fields_ = [
        ("model_filename", ctypes.c_char_p),
        ("executable_path", ctypes.c_char_p),
        ("kcpp_main_gpu", ctypes.c_int),
        ("vulkan_info", ctypes.c_char_p),
        ("devices_override", ctypes.c_char_p),
        ("quiet", ctypes.c_bool),
        ("debugmode", ctypes.c_int),
    ]

class whisper_generation_inputs(ctypes.Structure):
    """ctypes record of a whisper request: prompt, audio data, a suppress flag and a language code."""
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("audio_data", ctypes.c_char_p),
        ("suppress_non_speech", ctypes.c_bool),
        ("langcode", ctypes.c_char_p),
    ]

class whisper_generation_outputs(ctypes.Structure):
    """ctypes record holding an int status and a data string."""
    _fields_ = [
        ("status", ctypes.c_int),
        ("data", ctypes.c_char_p),
    ]

class tts_load_model_inputs(ctypes.Structure):
    """ctypes record of the settings supplied when loading the text-to-speech models."""
    _fields_ = [
        ("threads", ctypes.c_int),
        ("ttc_model_filename", ctypes.c_char_p),
        ("cts_model_filename", ctypes.c_char_p),
        ("executable_path", ctypes.c_char_p),
        ("kcpp_main_gpu", ctypes.c_int),
        ("vulkan_info", ctypes.c_char_p),
        ("gpulayers", ctypes.c_int),
        ("flash_attention", ctypes.c_bool),
        ("ttsmaxlen", ctypes.c_int),
        ("devices_override", ctypes.c_char_p),
        ("quiet", ctypes.c_bool),
        ("debugmode", ctypes.c_int),
    ]

class tts_generation_inputs(ctypes.Structure):
    """ctypes record of a text-to-speech request: prompt, two integer seeds and three custom speaker strings."""
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("speaker_seed", ctypes.c_int),
        ("audio_seed", ctypes.c_int),
        ("custom_speaker_voice", ctypes.c_char_p),
        ("custom_speaker_text", ctypes.c_char_p),
        ("custom_speaker_data", ctypes.c_char_p),
    ]

class tts_generation_outputs(ctypes.Structure):
    """ctypes record holding an int status and a data string."""
    _fields_ = [
        ("status", ctypes.c_int),
        ("data", ctypes.c_char_p),
    ]

class embeddings_load_model_inputs(ctypes.Structure):
    """ctypes record of the settings supplied when loading an embeddings model."""
    _fields_ = [
        ("threads", ctypes.c_int),
        ("model_filename", ctypes.c_char_p),
        ("executable_path", ctypes.c_char_p),
        ("kcpp_main_gpu", ctypes.c_int),
        ("vulkan_info", ctypes.c_char_p),
        ("gpulayers", ctypes.c_int),
        ("flash_attention", ctypes.c_bool),
        ("use_mmap", ctypes.c_bool),
        ("embeddingsmaxctx", ctypes.c_int),
        ("devices_override", ctypes.c_char_p),
        ("quiet", ctypes.c_bool),
        ("debugmode", ctypes.c_int),
    ]

class embeddings_generation_inputs(ctypes.Structure):
    """ctypes record of an embeddings request: the prompt string and a truncate flag."""
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("truncate", ctypes.c_bool),
    ]

class embeddings_generation_outputs(ctypes.Structure):
    """ctypes record holding an int status, an int count and a data string."""
    _fields_ = [
        ("status", ctypes.c_int),
        ("count", ctypes.c_int),
        ("data", ctypes.c_char_p),
    ]

class StdoutRedirector:
    """File-like object that writes to the original terminal stdout and mirrors the text to a second writer.

    `writer` may be None or any object with write()/flush(). If the writer
    ever raises, it is dropped and only the terminal keeps receiving output.
    All errors are swallowed on purpose: logging must never crash the caller.
    """
    def __init__(self, writer):
        self.writer = writer
        self.terminal = sys.__stdout__ # can be None when the process has no console
    def write(self, message):
        try:
            # Always write to terminal, then duplicate to pipe writer
            self.terminal.write(message)
            self.terminal.flush()
            if self.writer:
                try:
                    self.writer.write(message)
                    self.writer.flush()
                except Exception:
                    self.writer = None # stop mirroring after the first failure
        except Exception:
            pass
    def flush(self):
        # Tolerate a missing or broken terminal exactly like write() does,
        # instead of raising from a bare flush call.
        try:
            self.terminal.flush()
        except Exception:
            pass

class MCPStdioClient:
    """JSON-RPC client that talks to an MCP server child process over its stdin/stdout.

    Each message is one line of JSON. The child's stderr is drained by a
    daemon thread, which keeps only the most recent lines for error reports.
    """
    def resolve_command(self, command):
        """Return the path shutil.which finds for `command`, or `command` itself when nothing is found."""
        resolved = shutil.which(command)
        if resolved:
            return resolved
        return command # fallback

    def __init__(self,command,largs,env=None,cwd=None):
        """Start the server process.

        command: executable name/path (str), or an argv sequence used as-is.
        largs:   extra arguments appended to the command line (may be empty/None).
        env:     optional mapping merged on top of a copy of os.environ.
        cwd:     optional working directory for the child.
        """
        if isinstance(command, str):
            command = self.resolve_command(command)
            cmd = [command]
        else:
            cmd = list(command)
        if largs:
            cmd.extend(largs)
        full_env = os.environ.copy()
        if env:
            full_env.update(env)
        self.process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
            env=full_env,
            cwd=cwd
        )
        self.lock = threading.Lock() # serialises writes and the matching read
        self.stderr_buffer = []
        self.stderr_limit = 20
        self.alive = True
        self.stderr_thread = threading.Thread(
            target=self._read_stderr,
            daemon=True
        )
        self.stderr_thread.start()
    def _read_stderr(self):
        """Thread body: collect the child's stderr lines, keeping at most stderr_limit of them."""
        try:
            for line in self.process.stderr:
                if not line:
                    break
                line = line.rstrip()
                self.stderr_buffer.append(line)
                if len(self.stderr_buffer) > self.stderr_limit:
                    self.stderr_buffer.pop(0)
        finally:
            self.alive = False # set once stderr reaches EOF or reading fails

    def _write_line(self, message: dict) -> None:
        """Serialise `message` and write it as one line to the child's stdin. Caller must hold self.lock."""
        if self.process.stdin.closed:
            raise RuntimeError("MCP server stdin is closed")
        self.process.stdin.write(json.dumps(message) + "\n")
        self.process.stdin.flush()

    def send(self, message: dict, await_response=True) -> dict: # Send JSON-RPC request and wait for one response.
        """Write `message` and return the next stdout line parsed as JSON.

        Returns None without reading when await_response is False.
        Raises RuntimeError if stdin is closed or the server closed stdout.
        """
        with self.lock:
            self._write_line(message)
            if not await_response:
                return None
            response = self.process.stdout.readline()
        if not response:
            # Empty read means EOF: show the tail of stderr to explain why.
            errmsg = "\n".join(self.stderr_buffer[-10:])
            print(f"[MCP Server Error!]\n{errmsg}")
            raise RuntimeError("MCP server closed stdout")
        return json.loads(response)
    def notify(self, message: dict) -> None: # Send JSON-RPC notification (no response expected).
        """Write `message` without reading a reply. Raises RuntimeError if stdin is closed."""
        with self.lock:
            self._write_line(message)
    def terminate(self, timeout=5.0):
        """Stop the server process and reap it so no zombie process is left behind.

        Asks the process to terminate, waits up to `timeout` seconds, then
        kills it if it is still running. The stdin pipe is closed afterwards.
        """
        proc = self.process
        proc.terminate()
        try:
            proc.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        try:
            proc.stdin.close()
        except Exception:
            pass # best effort: the pipe may already be broken

class MCPHTTPClient:
    """JSON-RPC client that talks to an MCP server by HTTP POST.

    Responses may arrive as a plain JSON body or as a text/event-stream;
    both are handled. A session id returned by the server is remembered and
    sent back on later requests.
    """
    def __init__(self, url, headers=None, timeout=60.0):
        """Store the endpoint, merge `headers` over the JSON defaults, and set a default SSL_CERT_DIR if needed."""
        global nocertify
        self.url = url
        self.headers = {"Content-Type": "application/json","Accept": "application/json, text/event-stream"}
        if headers:
            self.headers.update(headers)
        self.timeout = timeout
        ssl_cert_dir = os.environ.get('SSL_CERT_DIR')
        if not ssl_cert_dir and not nocertify and os.name != 'nt':
            os.environ['SSL_CERT_DIR'] = '/etc/ssl/certs'

    def _read_sse(self, response) -> bytes:
        """Read a server-sent-event stream and return the last JSON event payload as UTF-8 bytes.

        `response` is any iterable of byte lines. Only `data:` lines are
        collected; events whose payload does not start with `{` or `[` are
        ignored. Raises RuntimeError when no JSON event is found.
        """
        json_events = []
        buf = []

        def flush_event():
            # Close the event being collected and keep it if it looks like JSON.
            if buf:
                payload = "\n".join(buf)
                if payload and payload[0] in "{[":
                    json_events.append(payload)
                buf.clear()

        for raw in response:
            # Strip "\r" too: SSE lines may end in CRLF, and a leftover "\r"
            # would hide the blank line that separates events.
            line = raw.decode("utf-8", errors="replace").rstrip("\r\n")
            if not line: # end of SSE event
                flush_event()
                continue
            if line.startswith(":"):
                continue # SSE comment line
            if line.startswith("data:"):
                buf.append(line[5:].lstrip())
        flush_event() # flush last event
        if not json_events:
            raise RuntimeError("MCP HTTP server returned no JSON SSE response")
        return json_events[-1].encode("utf-8")


    def send(self, message: dict, await_response=True) -> dict: # Send JSON-RPC request and return response.
        """POST `message` and return the decoded JSON reply.

        Returns None when await_response is False. Raises RuntimeError on
        HTTP errors, connection failures, an empty body or invalid JSON.
        """
        data = json.dumps(message).encode("utf-8")
        req = urllib.request.Request(self.url, data=data, headers=self.headers, method="POST")
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                # Remember the session id only when the server actually sent one;
                # never invent an id the server did not assign.
                sid = response.headers.get("MCP-Session-Id")
                if sid:
                    self.headers["MCP-Session-Id"] = sid
                ctype = response.headers.get("Content-Type","")
                body = self._read_sse(response) if "text/event-stream" in ctype else response.read()
        except urllib.error.HTTPError as e: # HTTP error with possible body
            error_body = e.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"MCP HTTP error {e.code}: {error_body}") from e
        except urllib.error.URLError as e:
            raise RuntimeError(f"MCP HTTP connection failed: {e.reason}") from e
        if not await_response:
            return None
        if not body:
            raise RuntimeError("MCP HTTP server returned empty response")
        try:
            return json.loads(body.decode("utf-8"))
        except json.JSONDecodeError as e:
            raise RuntimeError(f"MCP HTTP server returned invalid JSON: {body!r}") from e

    def notify(self, message: dict) -> None: # Send JSON-RPC notification (no response expected).
        """POST `message` and discard any reply. Raises RuntimeError on HTTP or connection failure."""
        data = json.dumps(message).encode("utf-8")
        req = urllib.request.Request(self.url,data=data,headers=self.headers,method="POST")
        try:
            with urllib.request.urlopen(req, timeout=self.timeout):
                pass
        except urllib.error.HTTPError as e: # Notifications may still return 204/empty; HTTPError means failure
            error_body = e.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"MCP HTTP notification failed ({e.code}): {error_body}") from e
        except urllib.error.URLError as e:
            raise RuntimeError(f"MCP HTTP notification connection failed: {e.reason}") from e


def getdirpath():
    """Return the directory containing this file, with symlinks resolved."""
    resolved_file = os.path.realpath(__file__)
    return os.path.dirname(resolved_file)
def getabspath():
    """Return the directory containing this file as an absolute path (symlinks are not resolved)."""
    absolute_file = os.path.abspath(__file__)
    folder, _ = os.path.split(absolute_file)
    return folder
def file_exists(filename):
    """Return True when `filename`, joined onto this file's directory, exists on disk."""
    candidate = os.path.join(getdirpath(), filename)
    return os.path.exists(candidate)

def suppress_stdout():
    """Silence console output by pointing stdout and stderr at os.devnull.

    Both the OS-level file descriptors and the Python-level sys.stdout /
    sys.stderr objects are redirected. The originals are stored in module
    globals so that restore_stdout() can put them back. Does nothing unless
    all six saved-state globals are currently falsy.
    """
    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
    if not saved_stdout and not saved_stderr and not saved_stdout_py and not saved_stderr_py and not stdout_nullfile and not stdout_nullfile_py:
        # Flush first so pending output is not lost or written after the switch.
        sys.stdout.flush()
        sys.stderr.flush()
        # Keep duplicates of the real descriptors before they are overwritten.
        saved_stdout = os.dup(sys.stdout.fileno())
        saved_stderr = os.dup(sys.stderr.fileno())
        saved_stderr_py = sys.stderr
        saved_stdout_py = sys.stdout
        stdout_nullfile = os.open(os.devnull, os.O_WRONLY)
        stdout_nullfile_py = open(os.devnull, 'w')
        # Redirect at the descriptor level, then at the Python object level.
        os.dup2(stdout_nullfile, sys.stdout.fileno())
        os.dup2(stdout_nullfile, sys.stderr.fileno())
        sys.stderr = sys.stdout = stdout_nullfile_py

def restore_stdout():
    """Undo suppress_stdout(): restore the saved stdout/stderr objects and descriptors.

    Does nothing unless all six saved-state globals are truthy. Afterwards the
    devnull handles and the duplicated descriptors are closed and every
    saved-state global is reset to None.
    """
    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
    if saved_stdout and saved_stderr and saved_stdout_py and saved_stderr_py and stdout_nullfile and stdout_nullfile_py:
        # Put the Python objects back first so fileno() below refers to the originals.
        sys.stdout = saved_stdout_py
        sys.stderr = saved_stderr_py
        os.dup2(saved_stdout, sys.stdout.fileno())
        os.dup2(saved_stderr, sys.stderr.fileno())
        # Release everything that suppress_stdout() opened or duplicated.
        os.close(stdout_nullfile)
        stdout_nullfile_py.close()
        os.close(saved_stdout)
        os.close(saved_stderr)
        saved_stdout = saved_stderr = saved_stdout_py = saved_stderr_py = stdout_nullfile = stdout_nullfile_py = None

def get_default_threads():
    """Pick a default CPU thread count from the number of logical CPUs.

    Starts from half the logical CPU count (1 when the count is unknown or 1).
    Above 3, one is subtracted (never going below 3). The result is limited
    to 8 when the processor string contains 'Intel', and to 64 in all cases.
    """
    logical_cpus = os.cpu_count()
    if logical_cpus is not None and logical_cpus > 1:
        half_cores = logical_cpus // 2
    else:
        half_cores = 1
    if half_cores > 3:
        threads = max(3, half_cores - 1)
    else:
        threads = half_cores
    if 'Intel' in platform.processor():
        threads = min(threads, 8) #this helps avoid e-cores.
    if threads <= 64:
        return threads
    print(f"Auto CPU Threads capped at 64 (instead of {threads}). You can override this by passing an explicit number of --threads.")
    return 64

def pick_existant_file(ntoption,nonntoption):
    """Choose which library filename to use for the current platform.

    The platform's own option (`ntoption` on Windows, `nonntoption` elsewhere)
    is returned when it exists. Otherwise the "precompiled_" variant of it is
    used if present, then the other platform's option if present. When none
    of them exist, the platform's own option is returned anyway.
    """
    prefix = "precompiled_"
    if os.name == 'nt':
        preferred, alternate = ntoption, nonntoption
    else:
        preferred, alternate = nonntoption, ntoption
    have_preferred = file_exists(preferred)
    have_alternate = file_exists(alternate)
    have_precompiled = file_exists(prefix + preferred)
    if not have_preferred:
        if have_precompiled:
            return prefix + preferred
        if have_alternate:
            return alternate
    return preferred

# Resolve the library filename for each backend once, at import time.
lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so")
lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
lib_vulkan_failsafe = pick_existant_file("koboldcpp_vulkan_failsafe.dll","koboldcpp_vulkan_failsafe.so")
lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so")
lib_hipblas = pick_existant_file("koboldcpp_hipblas.dll","koboldcpp_hipblas.so")
lib_vulkan = pick_existant_file("koboldcpp_vulkan.dll","koboldcpp_vulkan.so")
lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_vulkan_noavx2.so")
libname = ""
# (library filename, menu label) pairs; the order here fixes the order of the unpacking below.
lib_option_pairs = [
    (lib_default, "Use CPU"),
    (lib_cublas, "Use CUDA"),
    (lib_hipblas, "Use hipBLAS (ROCm)"),
    (lib_vulkan, "Use Vulkan"),
    (lib_noavx2, "Use CPU (Old CPU)"),
    (lib_vulkan_noavx2, "Use Vulkan (Old CPU)"),
    (lib_vulkan_failsafe, "Use Vulkan (Older CPU)"),
    (lib_failsafe, "Failsafe Mode (Older CPU)")]
# Each *_option is its menu label when the library file exists, otherwise None.
# NOTE(review): on Windows the second check looks for a file named after the label plus ".dll"
# rather than after the library - this looks unintended, confirm before relying on it.
default_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, vulkan_noavx2_option, vulkan_failsafe_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
# Labels of the backends whose library file is actually present.
runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]

def init_library():
    """Choose the backend library for the current args, load it, and declare its C signatures.

    Selection precedence: the old-CPU builds (when args.noavx2 is set), then
    CUDA/hipBLAS, then Vulkan. Whenever the requested build is not present on
    disk the default library name is kept; if no backend flag was given and the
    default file itself is absent, the noavx2 build is used when it exists.

    The chosen name is stored in the global `libname` and the loaded ctypes
    library in the global `handle`.
    """
    global handle, args, libname
    global lib_default,lib_failsafe,lib_noavx2,lib_vulkan_failsafe,lib_cublas,lib_hipblas,lib_vulkan,lib_vulkan_noavx2

    libname = lib_default

    if args.noavx2: #failsafe implies noavx2 always
        wants_failsafe = args.failsafe
        wants_vulkan = args.usevulkan is not None
        # order matters: the most specific old-CPU build that exists wins
        if wants_failsafe and wants_vulkan and file_exists(lib_vulkan_failsafe):
            libname = lib_vulkan_failsafe
        elif wants_vulkan and file_exists(lib_vulkan_noavx2):
            libname = lib_vulkan_noavx2
        elif wants_failsafe and file_exists(lib_failsafe):
            print("!!! Attempting to use FAILSAFE MODE !!!")
            libname = lib_failsafe
        elif file_exists(lib_noavx2):
            libname = lib_noavx2
    elif args.usecuda is not None:
        # first existing of cublas / hipblas, else keep the default
        libname = next((cand for cand in (lib_cublas, lib_hipblas) if file_exists(cand)), libname)
    elif args.usevulkan is not None:
        # first existing of vulkan / vulkan_noavx2, else keep the default
        libname = next((cand for cand in (lib_vulkan, lib_vulkan_noavx2) if file_exists(cand)), libname)
    elif libname == lib_default and not file_exists(lib_default) and file_exists(lib_noavx2):
        libname = lib_noavx2

    print("Initializing dynamic library: " + libname)
    dir_path = getdirpath()
    abs_path = getabspath()

    #add all potential paths
    if os.name=='nt':
        for search_dir in (dir_path, abs_path):
            os.add_dll_directory(search_dir)
        os.add_dll_directory(os.getcwd())
        # for the CUDA / HIP builds, also add <toolkit>/bin when the env var is set and the folder exists
        for backend_lib, env_key in ((lib_cublas, "CUDA_PATH"), (lib_hipblas, "HIP_PATH")):
            if libname == backend_lib and env_key in os.environ:
                toolkit_bin = os.path.join(os.environ[env_key], "bin")
                if os.path.exists(toolkit_bin):
                    os.add_dll_directory(toolkit_bin)

    handle = ctypes.CDLL(os.path.join(dir_path, libname))

    # (exported function, argtypes or None to leave them undeclared, restype)
    signatures = (
        ("load_model", [load_model_inputs], ctypes.c_bool),
        ("generate", [generation_inputs], generation_outputs),
        ("new_token", [ctypes.c_int], ctypes.c_char_p),
        ("get_stream_count", None, ctypes.c_int),
        ("has_finished", None, ctypes.c_bool),
        ("has_audio_support", None, ctypes.c_bool),
        ("has_vision_support", None, ctypes.c_bool),
        ("get_last_eval_time", None, ctypes.c_float),
        ("get_last_process_time", None, ctypes.c_float),
        ("get_last_token_count", None, ctypes.c_int),
        ("get_last_input_count", None, ctypes.c_int),
        ("get_last_seed", None, ctypes.c_int),
        ("get_last_draft_success", None, ctypes.c_int),
        ("get_last_draft_failed", None, ctypes.c_int),
        ("get_total_img_gens", None, ctypes.c_int),
        ("get_total_tts_gens", None, ctypes.c_int),
        ("get_total_transcribe_gens", None, ctypes.c_int),
        ("get_total_gens", None, ctypes.c_int),
        ("get_last_stop_reason", None, ctypes.c_int),
        ("abort_generate", None, ctypes.c_bool),
        ("token_count", None, token_count_outputs),
        ("get_pending_output", None, ctypes.c_char_p),
        ("get_chat_template", None, ctypes.c_char_p),
        ("calc_new_state_kv", None, ctypes.c_size_t),
        ("calc_new_state_tokencount", None, ctypes.c_size_t),
        ("calc_old_state_kv", [ctypes.c_int], ctypes.c_size_t),
        ("calc_old_state_tokencount", [ctypes.c_int], ctypes.c_size_t),
        ("save_state_kv", [ctypes.c_int], ctypes.c_size_t),
        ("load_state_kv", [ctypes.c_int], ctypes.c_bool),
        ("clear_state_kv", None, ctypes.c_bool),
        ("sd_load_model", [sd_load_model_inputs], ctypes.c_bool),
        ("sd_generate", [sd_generation_inputs], sd_generation_outputs),
        ("sd_upscale", [sd_upscale_inputs], sd_generation_outputs),
        ("sd_get_info", [], sd_info_outputs),
        ("whisper_load_model", [whisper_load_model_inputs], ctypes.c_bool),
        ("whisper_generate", [whisper_generation_inputs], whisper_generation_outputs),
        ("tts_load_model", [tts_load_model_inputs], ctypes.c_bool),
        ("tts_generate", [tts_generation_inputs], tts_generation_outputs),
        ("embeddings_load_model", [embeddings_load_model_inputs], ctypes.c_bool),
        ("embeddings_generate", [embeddings_generation_inputs], embeddings_generation_outputs),
        ("last_logprobs", None, last_logprobs_outputs),
        ("detokenize", [token_count_outputs], ctypes.c_char_p),
    )
    for func_name, arg_types, ret_type in signatures:
        c_func = getattr(handle, func_name)
        if arg_types is not None:
            c_func.argtypes = arg_types
        c_func.restype = ret_type

def set_backend_props(inputs):
    """Populate the GPU-selection and universal fields of `inputs` from the global args and return it.

    Side effects: may set CUDA_DEVICE_ORDER, CUDA_VISIBLE_DEVICES and
    HIP_VISIBLE_DEVICES in os.environ.
    """
    # we must force an explicit tensor split
    # otherwise the default will divide equally and multigpu crap will slow it down badly
    has_explicit_main_gpu = args.maingpu is not None and args.maingpu>=0
    inputs.kcpp_main_gpu = args.maingpu if has_explicit_main_gpu else 0

    # first of "0".."3" present in the usecuda options, or None when no index was given
    cuda_index = None
    if args.usecuda:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        cuda_index = next((idx for idx in ("0", "1", "2", "3") if idx in args.usecuda), None)

    if cuda_index is not None:
        if not args.tensor_split:
            # restrict visibility to the chosen device and address it as main gpu 0
            os.environ["CUDA_VISIBLE_DEVICES"] = cuda_index
            os.environ["HIP_VISIBLE_DEVICES"] = cuda_index
            inputs.kcpp_main_gpu = 0
        elif not has_explicit_main_gpu:
            # with a tensor split, the index only picks the main gpu when none was given explicitly
            inputs.kcpp_main_gpu = int(cuda_index)

    #usevulkan is an empty array if using vulkan without defined gpu
    vulkan_ids = "".join(str(dev) for dev in args.usevulkan) if args.usevulkan else ""
    inputs.vulkan_info = vulkan_ids.encode("UTF-8")

    # set universal flags
    device_override = args.device if args.device else ""
    inputs.devices_override = device_override.encode("UTF-8")
    inputs.quiet = args.quiet
    inputs.debugmode = args.debugmode
    inputs.executable_path = (getdirpath()+"/").encode("UTF-8")

    return inputs

def end_trim_to_sentence(input_text):
    """Cut `input_text` after its last sentence-ending character (or newline) and strip whitespace.

    If no such character is found, or the only one is the very first character,
    the whole text is returned stripped.
    """
    terminators = ('.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…', "\n")
    cut = max(input_text.rfind(mark) for mark in terminators)
    if cut <= 0:
        return input_text.strip()
    return input_text[:cut + 1].strip()

def tryparseint(value,fallback):
    """Convert `value` to an int, returning `fallback` when that is not possible.

    None yields the fallback. The strings "true"/"false" (any casing) map to 1/0.
    Anything else goes through int(); values int() rejects (unparsable strings,
    lists/dicts, NaN, infinite floats) yield the fallback instead of raising.
    """
    if value is None:
        return fallback
    if isinstance(value, str):
        lower_value = value.lower()
        if lower_value == "true":
            return 1
        if lower_value == "false":
            return 0
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported type such as list/dict; OverflowError: infinite float
        return fallback
def tryparsefloat(value,fallback):
    """Convert `value` to a float, returning `fallback` when that is not possible.

    None yields the fallback, as do values float() rejects: unparsable strings,
    unsupported types such as lists/dicts, and ints too large for a float.
    """
    if value is None:
        return fallback
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported type such as list/dict; OverflowError: int too large for a float
        return fallback

def replace_last_in_string(text: str, match: str, replacement: str) -> str:
    """Return `text` with only the last occurrence of `match` swapped for `replacement`.

    `text` is returned unchanged when `match` is empty or does not occur.
    """
    if match == "":
        return text
    pos = text.rfind(match)
    if pos < 0:
        return text  # match not found
    return text[:pos] + replacement + text[pos + len(match):]

def is_incomplete_utf8_sequence(byte_seq):
    """Return True only when `byte_seq` fails to decode because it ends mid-character.

    Note: this flags INCOMPLETE sequences only; valid data and corrupted
    (invalid but not truncated) data both return False.
    """
    try:
        byte_seq.decode('utf-8')
    except UnicodeDecodeError as err:
        # truncated input is reported with this exact reason; anything else is corruption
        return err.reason == 'unexpected end of data'
    return False  # Valid UTF-8

def strip_base64_prefix(encoded_data):
    """Drop a leading "data:image...," header from `encoded_data`, if present.

    Empty or None input yields "". Input without the "data:image" prefix, or
    with the prefix but no comma, is returned unchanged.
    """
    if not encoded_data:
        return ""
    if not encoded_data.startswith("data:image"):
        return encoded_data
    _header, comma, payload = encoded_data.partition(',')
    return payload if comma else encoded_data

def old_cpu_check():
    """Report which AVX level the CPU lacks.

    Returns -1 for pass (not x86_64 Linux/Windows, or detection failed),
    0 if the CPU has avx2, 1 if it has avx but not avx2, 2 if it has neither.
    """
    on_linux = sys.platform == "linux"
    on_windows = os.name == 'nt'
    if not ((on_linux or on_windows) and platform.machine().lower() in ("x86_64", "amd64")):
        return -1 #doesnt deal with avx at all.
    try:
        if on_linux:
            with open('/proc/cpuinfo', 'r') as f:
                cpuinfo = f.read().lower()
            if 'avx2' in cpuinfo:
                return 0
            return 2 if 'avx' not in cpuinfo else 1
        if on_windows:
            # query the helper executable located next to this file; it prints JSON flags
            basepath = os.path.abspath(os.path.dirname(__file__))
            completed = subprocess.run([os.path.join(basepath, "simplecpuinfo.exe")], capture_output=True, text=True, check=True, creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS, encoding='utf-8', timeout=6)
            data = json.loads(completed.stdout)
            if data["avx2"]==0:
                return 2 if data["avx"]==0 else 1
        return 0
    except Exception:
        return -1 #cannot determine


def unpack_to_dir(destpath = ""):
    srcpath = os.path.abspath(os.path.dirname(__file__))
    cliunpack = False if destpath == "" else True
    print("Attempt to unpack KoboldCpp into directory...")

    if not cliunpack:
        from tkinter import messagebox
        destpath = zentk_askdirectory(title='Select an empty folder to unpack KoboldCpp')
        if not destpath:
            return

    if not os.path.isdir(destpath):
        os.makedirs(destpath)

    if os.path.isdir(srcpath) and os.path.isdir(destpath) and not os.listdir(destpath):
        try:
            if cliunpack:
                print(f"KoboldCpp will be extracted to {destpath}\nThis process may take several seconds to complete.")
            else:
                messagebox.showinfo("Unpack Starting", f"KoboldCpp will be extracted to {destpath}\nThis process may take several seconds to complete.")
            pyds_dir = os.path.join(destpath, 'pyds')
            using_pyinstaller_6 = False
            try:
                import pkg_resources
                piver = pkg_resources.get_distribution("pyinstaller").version
                print(f"PyInstaller Version: {piver}")
                if piver.startswith("6."):
                    using_pyinstaller_6 = True
                    os.makedirs(os.path.join(destpath, "_internal"), exist_ok=True)
                    pyds_dir = os.path.join(os.path.join(destpath, "_internal"), 'pyds')
            except Exception:
                pass
            os.makedirs(pyds_dir, exist_ok=True)
            for item in os.listdir(srcpath):
                s = os.path.join(srcpath, item)
                d = os.path.join(destpath, item)
                d2 = d  #this will be modified for pyinstaller 6 and unmodified for pyinstaller 5
                if using_pyinstaller_6:
                    d2 = os.path.join(os.path.join(destpath, "_internal"), item)
                if using_pyinstaller_6 and item.startswith('koboldcpp-launcher'):  # Move koboldcpp-launcher to its intended location
                    shutil.copy2(s, d)
                    continue
                if item.endswith('.pyd'):  # relocate pyds files to subdirectory
                    pyd = os.path.join(pyds_dir, item)
                    shutil.copy2(s, pyd)
                    continue
                if os.path.isdir(s):
                    shutil.copytree(s, d2, False, None)
                else:
                    shutil.copy2(s, d2)
            if cliunpack:
                print(f"KoboldCpp successfully extracted to {destpath}")
            else:
                messagebox.showinfo("KoboldCpp Unpack Success", f"KoboldCpp successfully extracted to {destpath}")
        except Exception as e:
            if cliunpack:
                print(f"An error occurred while unpacking: {e}")
            else:
                messagebox.showerror("Error", f"An error occurred while unpacking: {e}")
    else:
        if cliunpack:
            print("The target folder is not empty or invalid. Please select an empty folder.")