-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathClustering_SQL.py
More file actions
1784 lines (1517 loc) · 84.9 KB
/
Clustering_SQL.py
File metadata and controls
1784 lines (1517 loc) · 84.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#sklearn imports
import gc
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.metrics import silhouette_score
from scipy.optimize import minimize
#from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
#from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'
#from sklearn.model_selection import ParameterGrid
#from sklearn import metrics
import datetime ####### for saving cluster analytics
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
# my ORM
from my_declarative_base import Encodings, Detections, Base, Images, Column, Integer, String, Date, Boolean, DECIMAL, BLOB, ForeignKey, JSON, ForeignKey
from sqlalchemy.exc import IntegrityError
from sqlalchemy import create_engine, text, MetaData, Table, Column, Numeric, Integer, VARCHAR, update, Float
from sqlalchemy.exc import OperationalError
from sqlalchemy.pool import NullPool
import pymongo
from pick import pick
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import os
import time
import pickle
from sys import platform
#mine
from mp_db_io import DataIO
from mp_sort_pose import SortPose
from tools_clustering import ToolsClustering
# --- crop/sort geometry passed into the SortPose config below ---
image_edge_multiplier_sm = [2.2, 2.2, 2.6, 2.2] # standard portrait
face_height_output = 500
motion = {"side_to_side": False, "forward_smile": True, "laugh": False, "forward_nosmile": False, "static_pose": False, "simple": False}
EXPAND = False
ONE_SHOT = False # take all files, based off the very first sort order.
JUMP_SHOT = False # jump to random file if can't find a run
'''
tracking time based on items, for speed predictions
earlier
47000, 240
100000, 695
Now
items, seconds
10000, 33
50000, 280
100000, 1030
200000, 3202
300000, 6275
1.1M, ???
'''
io = DataIO()
db = io.db
# io.db["name"] = "stock"
# io.db["name"] = "ministock"
# mongo_client = pymongo.MongoClient(io.dbmongo['host'])
# mongo_db = mongo_client[io.dbmongo['name']]
# io.mongo_db = mongo_db
# mongo_collection = mongo_db[io.dbmongo['collection']]
NUMBER_OF_PROCESSES = io.NUMBER_OF_PROCESSES
# interactive menu; MODE becomes the selected option index (0-3)
title = 'Please choose your operation: '
options = ['kmeans cluster and save clusters', 'cluster assignment', 'calculate cluster medians, cluster_dist and save clusters', 'make meta clusters']
option, MODE = pick(options, title)
# MODE = 1
# CLUSTER_TYPE = "Clusters"
# CLUSTER_TYPE = "BodyPoses"
# CLUSTER_TYPE = "BodyPoses3D" # use this for META 3D body clusters, Arms will start build but messed up because of subset landmarks
# CLUSTER_TYPE = "ArmsPoses3D"
CLUSTER_TYPE = "ObjectFusion"
# CLUSTER_TYPE = "HandsPositions"
# CLUSTER_TYPE = "HandsGestures"
# CLUSTER_TYPE = "FingertipsPositions"
# CLUSTER_TYPE = "HSV" # only works with cluster save, not with assignment
VERBOSE = True
cl = ToolsClustering(CLUSTER_TYPE, VERBOSE=VERBOSE)
# Note: session will be passed to cl after engine/session creation below
# LMS_DIMENSIONS = values stored per landmark (x,y,z and optionally visibility)
if "3D" in cl.CLUSTER_TYPE:
    if cl.CLUSTER_TYPE == "ArmsPoses3D":
        # this is a hacky way of saying I want XYZ but not Vis
        LMS_DIMENSIONS = 3
    else:
        LMS_DIMENSIONS = 4
else:
    LMS_DIMENSIONS = 3
OFFSET = 0
# SELECT MAX(cmb.image_id) FROM ImagesBodyPoses3D cmb JOIN Encodings e ON cmb.image_id = e.image_id WHERE e.is_feet = 0;
# START_ID = 129478350 # only used in MODE 1
START_ID = 0 # only used in MODE 1
# WHICH TABLE TO USE?
# SegmentTable_name = 'SegmentOct20'
SegmentTable_name = 'SegmentBig_isface'
# SegmentTable_name = 'Encodings'
# if doing MODE == 2, use SegmentHelper_name to subselect SQL query
# unless you know what you are doing, leave this as None
# SegmentHelper_name = None
# if cl.CLUSTER_TYPE == "ArmsPoses3D":
# SegmentHelper_name = 'SegmentHelper_sept2025_heft_keywords'
SegmentHelper_name = 'Detections'
# SegmentHelper_name = 'SegmentHelper_dec2025_body3D_outOfSegment'
# SegmentHelper_name = 'SegmentHelper_oct2025_every40'
FORCE_HAND_LANDMARKS = False # when doing ArmsPoses3D, default is True, so mongo_hand_landmarks = 1
# TESTING MODE - reduce dataset size for faster iteration using pre-filtered table
# Set to True to use SegmentHelper_oct2025_every40 (every 40th image, ~2.5% of full dataset)
# Set to False for production full dataset processing
SKIP_TESTING = False
# number of clusters produced. run GET_OPTIMAL_CLUSTERS and add that number here
# 32 for hand positions
# 128 for hand gestures
N_CLUSTERS = 768 # Increased from 768 - need more granularity to break up mega-clusters
N_META_CLUSTERS = 256
# MODE 3 builds meta clusters on top of existing per-image clusters
if MODE == 3:
    META = True
    N_CLUSTERS = N_META_CLUSTERS
    LMS_DIMENSIONS = 4
else: META = False
# SortPose options not used by this script; zeroed out before building cfg
ONE_SHOT= JUMP_SHOT= HSV_CONTROL= INPAINT= OBJ_CLS_ID = UPSCALE_MODEL_PATH =None
# face_height_output, image_edge_multiplier, EXPAND=False, ONE_SHOT=False, JUMP_SHOT=False, HSV_CONTROL=None, VERBOSE=True,INPAINT=False, SORT_TYPE="128d", OBJ_CLS_ID = None,UPSCALE_MODEL_PATH=None, use_3D=False
# assemble the SortPose constructor config from the flags above
cfg = {
    'motion': motion,
    'face_height_output': face_height_output,
    'image_edge_multiplier': image_edge_multiplier_sm,
    'EXPAND': EXPAND,
    'ONE_SHOT': ONE_SHOT,
    'JUMP_SHOT': JUMP_SHOT,
    'HSV_CONTROL': HSV_CONTROL,
    'VERBOSE': VERBOSE,
    'INPAINT': INPAINT,
    'SORT_TYPE': cl.CLUSTER_TYPE,
    'OBJ_CLS_ID': OBJ_CLS_ID,
    'UPSCALE_MODEL_PATH': UPSCALE_MODEL_PATH,
    'LMS_DIMENSIONS': LMS_DIMENSIONS
}
sort = SortPose(config=cfg)
# MM you need to use conda activate mps_torch310
SUBSELECT_ONE_CLUSTER = 0
# overrides SUBSET_LANDMARKS which is now set in sort pose init
if cl.CLUSTER_TYPE == "BodyPoses3D":
    if META:
        # sort.SUBSET_LANDMARKS = sort.make_subset_landmarks(15,16) # just wrists
        sort.SUBSET_LANDMARKS = sort.make_subset_landmarks(11,22)
        USE_SUBSET_MEDIANS = True
        # else: sort.SUBSET_LANDMARKS = None
    else:
        # setting SUBSET_LANDMARKS for to nose [0] and ears+mouth+rest of body [7-32]
        sort.SUBSET_LANDMARKS = sort.make_subset_landmarks(0,0) + sort.make_subset_landmarks(7,32)
        USE_SUBSET_MEDIANS = True
elif cl.CLUSTER_TYPE == "ArmsPoses3D":
    # print("OVERRIDE setting cl.CLUSTER_TYPE to BodyPoses3D for ArmsPoses3D subset median calculation: ",cl.CLUSTER_TYPE, sort.CLUSTER_TYPE)
    # sort.SUBSET_LANDMARKS = sort.make_subset_landmarks(0,0) + sort.make_subset_landmarks(7,22)
    sort.SUBSET_LANDMARKS = sort.make_subset_landmarks(0,22)
    USE_SUBSET_MEDIANS = True
    print("OVERRIDE after construction setting sort.SUBSET_LANDMARKS to: ",sort.SUBSET_LANDMARKS)
else:
    sort.SUBSET_LANDMARKS = None
    USE_SUBSET_MEDIANS = False
USE_HEAD_POSE = False
SHORTRANGE = False # controls a short range query for the face x,y,z and mouth gap
ANGLES = []
STRUCTURE = "list3D" # 2d "dict", 2d "list", 2d plus visibility "list3", 3d plus visibility "list3D"
print("STRUCTURE: ",STRUCTURE)
if "list3" in STRUCTURE:
    print("setting 3D to True")
    sort.use_3D = True
# this works for using segment in stock, and for ministock
USE_SEGMENT = True
# get the best fit for clusters
GET_OPTIMAL_CLUSTERS=False
SAVE_FIG=False ##### option for saving the visualized data
# TK 4 HSV
# table_cluster_type is used in SQL queries and table class defs
table_cluster_type = cl.set_table_cluster_type(META)
# setting the data column and is_feet based on cl.CLUSTER_TYPE from dict
this_data_column = cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]
# Build the SELECT/FROM/WHERE/LIMIT fragments consumed later by selectSQL().
if USE_SEGMENT is True and (cl.CLUSTER_TYPE != "Clusters"):
    print("setting Poses SQL")
    dupe_table_pre = "s" # set default dupe_table_pre to s
    FROM =f"{SegmentTable_name} s " # set base FROM
    if "SegmentBig_" in SegmentTable_name:
        # handles segmentbig which doesn't have is_dupe_of, etc
        FROM += f" JOIN Encodings e ON s.image_id = e.image_id "
        dupe_table_pre = "e"
        WHERE = f" {dupe_table_pre}.is_dupe_of IS NULL AND {dupe_table_pre}.two_noses IS NULL "
    elif "Encodings" in SegmentTable_name:
        # handles segmentbig which doesn't have is_dupe_of, etc
        # FROM += f" JOIN Encodings e ON s.image_id = e.image_id "
        dupe_table_pre = "s" # because SegmentTable_name is Encodings and gets aliased as s
        WHERE = f" {dupe_table_pre}.is_dupe_of IS NULL AND {dupe_table_pre}.two_noses IS NULL AND {dupe_table_pre}.is_face = 1 " # ensures we are still only using faces
    else:
        WHERE = f" {dupe_table_pre}.is_dupe_of IS NULL "
    # Basic Query, this works with SegmentOct20. Previously included s.face_x, s.face_y, s.face_z, s.mouth_gap
    SELECT = "DISTINCT(s.image_id)"
    # handle ObjectFusion, just get pitch, yaw, roll. Detections handled later:
    if cl.CLUSTER_TYPE == "ObjectFusion":
        SELECT += f" , {dupe_table_pre}.pitch, {dupe_table_pre}.yaw, {dupe_table_pre}.roll "
    if isinstance(this_data_column, list):
        if "HSV" in cl.CLUSTER_TYPE:
            # NOTE(review): SELECT is "DISTINCT(s.image_id)..." at this point, so this
            # .replace() of the old face_x/face_y fragment is a no-op — confirm intent.
            SELECT = SELECT.replace(f"s.face_x, s.face_y, s.face_z, s.mouth_gap", f"ib.hue, ib.sat, ib.val ")
            FROM += f" JOIN ImagesBackground ib ON s.image_id = ib.image_id "
            WHERE += f" AND ib.hue IS NOT NULL AND ib.sat IS NOT NULL AND ib.val IS NOT NULL "
    else:
        WHERE += f" AND {dupe_table_pre}.{this_data_column} = 1 "
    if cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["is_feet"] is not None:
        WHERE += f" AND {dupe_table_pre}.is_feet = {cl.CLUSTER_DATA[cl.CLUSTER_TYPE]['is_feet']} "
    if cl.CLUSTER_DATA[cl.CLUSTER_TYPE].get("mongo_hand_landmarks") is not None and FORCE_HAND_LANDMARKS:
        WHERE += f" AND {dupe_table_pre}.mongo_hand_landmarks = {cl.CLUSTER_DATA[cl.CLUSTER_TYPE]['mongo_hand_landmarks']} "
    # refactoring the above to use the dict Oct 11
    # if cl.CLUSTER_TYPE == "BodyPoses": WHERE += f" AND {dupe_table_pre}.mongo_body_landmarks = 1 and {dupe_table_pre}.is_feet = 1"
    # # elif cl.CLUSTER_TYPE == "BodyPoses3D": WHERE += f" AND {dupe_table_pre}.mongo_body_landmarks_3D = 1 and {dupe_table_pre}.is_feet = 1"
    # elif cl.CLUSTER_TYPE == "BodyPoses3D": WHERE += f" AND {dupe_table_pre}.mongo_body_landmarks_3D = 1"
    # elif cl.CLUSTER_TYPE == "ArmsPoses3D": WHERE += f" AND {dupe_table_pre}.mongo_body_landmarks_3D = 1"
    # elif cl.CLUSTER_TYPE == "HandsGestures": WHERE += f" AND {dupe_table_pre}.mongo_hand_landmarks = 1 "
    # elif cl.CLUSTER_TYPE in ["HandsPositions","FingertipsPositions"] : WHERE += f" AND {dupe_table_pre}.mongo_hand_landmarks_norm = 1 "
    # per-MODE query shaping
    if MODE == 0:
        if SHORTRANGE: WHERE += " AND s.face_x > -35 AND s.face_x < -24 AND s.face_y > -3 AND s.face_y < 3 AND s.face_z > -3 AND s.face_z < 3 "
        # FROM += f" INNER JOIN Encodings h ON h.image_id = s.image_id "
        # FROM += f" INNER JOIN {HelperTable_name} h ON h.image_id = s.image_id "
        if SUBSELECT_ONE_CLUSTER:
            if cl.CLUSTER_TYPE == "HandsGestures": subselect_cluster = "ImagesHandsPositions"
            elif cl.CLUSTER_TYPE == "HandsPositions": subselect_cluster = "ImagesHandsGestures"
            FROM += f" INNER JOIN {subselect_cluster} sc ON sc.image_id = s.image_id "
            WHERE += f" AND sc.cluster_id = {SUBSELECT_ONE_CLUSTER} "
        if SegmentHelper_name:
            FROM += f" INNER JOIN {SegmentHelper_name} h ON h.image_id = s.image_id "
    elif MODE in (1,2):
        FROM += f" LEFT JOIN Images{table_cluster_type} ic ON s.image_id = ic.image_id"
        if MODE == 1:
            WHERE += " AND ic.cluster_id IS NULL "
            if SegmentHelper_name:
                FROM += f" INNER JOIN {SegmentHelper_name} h ON h.image_id = s.image_id "
                # WHERE += " AND h.is_body = 1"
            if START_ID:
                WHERE += f" AND s.image_id >= {START_ID} "
        elif MODE == 2:
            # if doing MODE == 2, use SegmentHelper_name to subselect SQL query
            FROM += f" INNER JOIN {table_cluster_type} c ON c.cluster_id = ic.cluster_id"
            SELECT += ",ic.cluster_id, ic.cluster_dist, c.cluster_median"
            WHERE += " AND ic.cluster_id IS NOT NULL AND ic.cluster_dist IS NULL"
    elif MODE == 3:
        # make meta clusters. redefining as a simple full select of the clusters table
        SELECT = "*"
        FROM = table_cluster_type
        WHERE = " cluster_id IS NOT NULL "
    # WHERE += " AND h.is_body = 1"
    LIMIT = 4000000
    BATCH_LIMIT = 10000
    '''
    Poses
    500 11s
    1000 21s
    2000 43s
    4000 87s
    30000 2553 @ hands elbows x 3d
    100000 90s @ HAND_LMS
    1000000 1077s @ HAND_LMS
    1100000 1177 @ HAND_LMSx 3d
    '''
elif USE_SEGMENT is True and MODE == 0:
    print("setting Poses SQL MODE 0 where using regular Clusters and ImagesClusters tables")
    # where the script is looking for files list
    # do not use this if you are using the regular Clusters and ImagesClusters tables
    SegmentTable_name = 'SegmentOct20'
    # 3.8 M large table (for Topic Model)
    HelperTable_name = "SegmentHelperMar23_headon"
    HelperTable_name = None
    # Basic Query, this works with gettytest3
    SELECT = "DISTINCT(s.image_id)"
    FROM = f"{SegmentTable_name} s"
    if HelperTable_name: FROM += f" INNER JOIN {HelperTable_name} h ON h.image_id = s.image_id "
    # WHERE = " s.mongo_body_landmarks = 1"
    WHERE = " s.mongo_face_landmarks = 1"
    # for selecting a specific topic
    FROM += f" INNER JOIN ImagesTopics it ON it.image_id = s.image_id "
    WHERE += " AND it.topic_id = 22 "
    # WHERE = "s.face_x > -33 AND s.face_x < -27 AND s.face_y > -2 AND s.face_y < 2 AND s.face_z > -2 AND s.face_z < 2"
    LIMIT = 5000
    '''
    350k 1900s
    '''
    # # join with SSD tables. Satyam, use the one below
    # SELECT = "DISTINCT(e.image_id), e.face_encodings68"
    # FROM = "Encodings e"
    # QUERY = "e.image_id IN"
    # SUBQUERY = f"(SELECT seg1.image_id FROM {SegmentTable_name} seg1 )"
    # WHERE = f"{QUERY} {SUBQUERY}"
    # LIMIT = 1000000
elif USE_SEGMENT is True and MODE == 1:
    SegmentTable_name = 'SegmentOct20'
    # Basic Query, this works with gettytest3
    SELECT = "DISTINCT(s.image_id),s.face_encodings68"
    FROM = f"{SegmentTable_name} s LEFT JOIN Images{cl.CLUSTER_TYPE} ic ON s.image_id = ic.image_id"
    WHERE = "ic.cluster_id IS NULL"
    LIMIT = "100"
else:
    # Basic Query, this works with gettytest3
    SELECT = "DISTINCT(image_id),face_encodings68"
    FROM ="encodings"
    WHERE = "face_encodings68 IS NOT NULL"
    LIMIT = 1000
    SegmentTable_name = ""
# Create the MySQL engine; NullPool + pre-ping to avoid stale pooled connections.
if db['unix_socket']:
    # for MM's MAMP config
    engine = create_engine("mysql+pymysql://{user}:{pw}@/{db}?unix_socket={socket}".format(
        user=db['user'], pw=db['pass'], db=db['name'], socket=db['unix_socket']
    ), pool_pre_ping=True, pool_recycle=600, poolclass=NullPool)
else:
    engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
        .format(host=db['host'], db=db['name'], user=db['user'], pw=db['pass']), pool_pre_ping=True, pool_recycle=600, poolclass=NullPool)
# metadata = MetaData(engine)
Session = sessionmaker(bind=engine)
session = Session()
# NOTE(review): this rebinds Base, shadowing the Base imported from
# my_declarative_base above — confirm which one the table classes expect.
Base = declarative_base()
# Pass session to ToolsClustering instance for database access
cl.session = session
# Print startup configuration
print("\n" + "="*70)
print("CLUSTERING CONFIGURATION")
print("="*70)
print(f"MODE: {MODE} ({option})")
print(f"CLUSTER_TYPE: {cl.CLUSTER_TYPE}")
print(f"N_CLUSTERS: {N_CLUSTERS}")
if SKIP_TESTING:
    print(f"\n⚠️ TESTING MODE ACTIVE: Using SegmentHelper_oct2025_every40 (~2.5% of full dataset)")
    print(f" Set SKIP_TESTING = False for production full dataset processing")
else:
    print(f"SKIP_TESTING: False (processing full dataset)")
print("="*70 + "\n")
# TK 4 HSV
Clusters, ImagesClusters, MetaClusters, ClustersMetaClusters = cl.construct_table_classes(table_cluster_type)
def selectSQL():
    """Run the SELECT assembled from the module-level SELECT/FROM/WHERE/LIMIT
    fragments and return the rows as a list of dicts keyed by column name.

    When SKIP_TESTING is True, an extra INNER JOIN against the pre-filtered
    SegmentHelper_oct2025_every40 table shrinks the dataset for fast iteration.

    Returns:
        list[dict]: one dict per fetched row.
    """
    if OFFSET: offset = f" OFFSET {str(OFFSET)}"
    else: offset = ""
    # Handle SKIP_TESTING for testing mode
    if SKIP_TESTING:
        # Add JOIN to pre-filtered testing table (every 40th image)
        selectsql = f"""
            SELECT {SELECT} FROM {FROM}
            INNER JOIN SegmentHelper_oct2025_every40 test ON s.image_id = test.image_id
            WHERE {WHERE} LIMIT {str(LIMIT)} {offset};
        """
        print(f"TESTING MODE: Using SegmentHelper_oct2025_every40 (~1-2% of full dataset)")
    else:
        # Normal query without testing table
        selectsql = f"SELECT {SELECT} FROM {FROM} WHERE {WHERE} LIMIT {str(LIMIT)} {offset};"
    print("actual SELECT is: ", selectsql)
    # BUGFIX: the original called engine.connect().execute(...) and never
    # released the connection; the context manager closes it deterministically.
    with engine.connect() as conn:
        result = conn.execute(text(selectsql))
        resultsjson = [dict(row) for row in result.mappings()]
    # Log results
    print(f"Fetched {len(resultsjson):,} images")
    return(resultsjson)
def landmarks_to_df_columnar(df, add_list=False, fit_scaler=False):
    """Reduce a landmarks DataFrame to the feature columns used for clustering.

    Selects the numeric landmark columns (integer-named or 'dim_'-prefixed),
    optionally narrows them to sort.SUBSET_LANDMARKS (appending head-pose
    columns when USE_HEAD_POSE is on), and returns a columnar DataFrame.
    ObjectFusion data is routed through cl.prepare_features_for_knn_v2 instead.

    Args:
        df: input DataFrame; the first data column sits at index 1.
        add_list: when True, append an 'obj_bbox_list' column holding each
            row's selected values as a Python list.
        fit_scaler: forwarded to the ObjectFusion feature preparer.

    Returns:
        DataFrame of the selected feature columns ('image_id' kept if present).
    """
    first_col = df.columns[1]
    print("first col: ",first_col)
    if cl.CLUSTER_TYPE == "ObjectFusion":
        print("cl.CLUSTER_TYPE == ObjectFusion, doing it via prepare_features_for_knn_v2", df)
        df_columnar = cl.prepare_features_for_knn_v2(df, fit_scaler=fit_scaler)
        # df_columnar = prepared_df.values
        return df_columnar
    # BUGFIX: default both values up front. The original left numerical_columns
    # and prefix_dim unbound (NameError) when the columns were neither ints
    # nor 'dim_'-prefixed strings.
    numerical_columns = []
    prefix_dim = False
    # if the first column is an int, then the columns are integers
    if isinstance(first_col, int):
        numerical_columns = [col for col in df.columns if isinstance(col, int)]
    elif any(isinstance(col, str) and col.startswith('dim_') for col in df.columns):
        # isinstance guard so stray non-string column labels can't raise
        numerical_columns = [col for col in df.columns if isinstance(col, str) and col.startswith('dim_')]
        prefix_dim = True
    # set hand_columns = to the numerical_columns in sort.SUBSET_LANDMARKS
    print("lms df columns: ",df.columns)
    if sort.SUBSET_LANDMARKS and (len(numerical_columns)/len(sort.SUBSET_LANDMARKS)) % 1 != 0:
        # only do this if the number of numerical columns is not a multiple of the subset landmarks
        # which is to say: the lms have already been subsetted
        if prefix_dim: subset_columns = [f'dim_{i}' for i in sort.SUBSET_LANDMARKS]
        else: subset_columns = [i for i in sort.SUBSET_LANDMARKS]
        print("subset lms columns: ",subset_columns)
        if USE_HEAD_POSE:
            df = df.apply(sort.weight_face_pose, axis=1)
            head_columns = ['face_x', 'face_y', 'face_z', 'mouth_gap']
            subset_columns += head_columns
    elif cl.CLUSTER_TYPE == "HSV":
        subset_columns = ["hue", "sat", "val"]
    else:
        subset_columns = numerical_columns
    if "image_id" in df.columns:
        df_columnar = df[['image_id'] + subset_columns]
    else:
        df_columnar = df[subset_columns]
    if add_list:
        print("landmarks_to_df_columnar adding obj_bbox_list column")
        # BUGFIX: copy before assigning so we don't write into a slice of df
        # (pandas SettingWithCopyWarning / silently-lost column).
        df_columnar = df_columnar.copy()
        df_columnar["obj_bbox_list"] = df[subset_columns].values.tolist()
    print("landmarks_to_df_columnar at the end these are the columns: ", df_columnar.columns)
    return df_columnar
# flatten_object_detections and prepare_features_for_knn have been moved to ToolsClustering class
# Use cl.flatten_object_detections(), cl.prepare_features_for_knn(), or cl.prepare_features_for_knn_v2() (with StandardScaler)
def kmeans_cluster(df, n_clusters=32, fit_scaler=True):
    """Fit KMeans on the (possibly subsetted) feature data and return the
    predicted cluster label for every row of df."""
    print("kmeans_cluster sort.SUBSET_LANDMARKS: ",sort.SUBSET_LANDMARKS)
    # Pose-style cluster types need their landmark columns extracted first.
    pose_like = ("BodyPoses", "BodyPoses3D", "ArmsPoses3D", "ObjectFusion")
    if cl.CLUSTER_TYPE in pose_like:
        print("cl.CLUSTER_TYPE == BodyPoses || ArmsPoses3D", df)
        features = landmarks_to_df_columnar(df, fit_scaler=fit_scaler)
    else:
        features = df
    print("clustering subset data shape: ", features.shape)
    if hasattr(features, 'iloc'):
        print("first row of numerical data: ", features.iloc[0])
    else:
        print("columnar is not a df: ")
    model = KMeans(n_clusters=n_clusters, n_init=10, init='k-means++',
                   random_state=42, max_iter=300, verbose=1)
    model.fit(features)
    return model.predict(features)
def best_score(df):
    """Sweep six candidate cluster counts between 4 and 24 and return the
    n_clusters value that maximizes the silhouette score."""
    print("starting best score", df)
    print("about to subset landmarks to thse columns: ",sort.SUBSET_LANDMARKS)
    df = landmarks_to_df_columnar(df)
    print("about to best score with subset data", df)
    candidates = np.linspace(4, 24, 6, dtype='int')
    scores = np.zeros(len(candidates))
    for idx, k in enumerate(candidates):
        model = KMeans(k, n_init=10, init='k-means++', random_state=42, max_iter=300)
        labels = model.fit_predict(df)
        scores[idx] = silhouette_score(df, labels)
    print(candidates, scores)
    return candidates[np.argmax(scores)]
def geometric_median(X, eps=1e-5, zero_threshold=1e-6):
    """Find the point minimizing the summed Euclidean distance to all rows of X.

    The search runs COBYLA from the centroid of the points.

    Args:
        X: 2D array-like, one point per row.
        eps: convergence tolerance handed to the optimizer.
        zero_threshold: components smaller than this in magnitude are
            snapped to exactly zero.

    Returns:
        ndarray holding the geometric median, or None when X is empty.
    """
    if len(X) == 0:
        return None

    def total_distance(candidate, points):
        # Objective: sum of Euclidean distances from candidate to every point.
        return np.sum(np.linalg.norm(points - candidate, axis=1))

    # The centroid is a cheap, well-behaved starting guess.
    start = np.mean(X, axis=0)
    solution = minimize(total_distance, start, args=(X,), method='COBYLA', tol=eps)
    median = solution.x
    # Snap numerically-tiny components to exact zero.
    median[np.abs(median) < zero_threshold] = 0
    return median
def calc_cluster_median(df, col_list, cluster_id):
    """Compute the geometric median of all rows assigned to cluster_id.

    Args:
        df: DataFrame with a 'cluster_id' column plus feature columns.
        col_list: column names fed to the median (ignored on the
            ObjectFusion path, which re-prepares/scales its own features).
        cluster_id: the cluster whose member rows are selected.

    Returns:
        The geometric median vector, or None when the cluster has no valid
        (non-empty, non-NaN) points.
    """
    cluster_df = df[df['cluster_id'] == cluster_id]
    if "top_face_object" in cluster_df.columns:
        print("flattening object detection columns for cluster median calculation")
        cluster_df = cluster_df.copy() # To avoid SettingWithCopyWarning
        # drop "image_id" if it exists, because it will mess with the knn input
        if 'image_id' in cluster_df.columns:
            cluster_df = cluster_df.drop(columns=['image_id'])
        # Use existing scaler (fit_scaler=False) for median calculation
        prepared_cluster_df = cl.prepare_features_for_knn_v2(cluster_df, fit_scaler=False)
        cluster_points = prepared_cluster_df.values
        print(f"Cluster {cluster_id} data after flattening: {prepared_cluster_df}")
        print(f"Cluster {cluster_id} points after flattening: {cluster_points}")
    else:
        print("calculating cluster median without object detection columns")
        print(f"Cluster {cluster_id} data: {cluster_df}")
        # Convert the selected dimensions into a NumPy array
        cluster_points = cluster_df[col_list].values
    # BUGFIX: validate BEFORE the debug print below — the original printed
    # cluster_points[0] first, which raises IndexError on an empty cluster.
    if len(cluster_points) == 0 or np.isnan(cluster_points).any():
        print(f"No valid points for cluster {cluster_id}, skipping median calculation.")
        return None
    print("cluster_points",(cluster_points[0]))
    # Calculate the geometric median for the cluster points
    cluster_median = geometric_median(cluster_points)
    return cluster_median
def build_col_list(df):
    """Map each feature-group key ('body_lms', 'left', 'right', 'face',
    'HSV', 'ObjectFusion') to the df columns holding that group's data for
    the current cl.CLUSTER_TYPE.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        dict of group-name -> list of column names (unused groups stay []).
    """
    print("building col list for df columns: ", df.columns)
    # BUGFIX: the original chained assignment (a = b = c = []) bound every
    # key to ONE shared list object; give each key its own independent list.
    col_list = {key: [] for key in ("left", "right", "body_lms", "face", "ObjectFusion")}
    if "body" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
        # tests data_column, so works for ArmsPoses3D too
        second_column_name = df.columns[1]
        # if the second column == 0, then compress the columsn into a list:
        if second_column_name == 0:
            print("compressing body_lms columns into a list")
            # If columns are integers, convert them to strings before checking prefix
            col_list["body_lms"] = [col for col in df.columns]
        else:
            col_list["body_lms"] = [col for col in df.columns if col.startswith('dim_')]
    elif "hand" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
        # elif cl.CLUSTER_TYPE in ["HandsPositions", "HandsGestures", "FingertipsPositions"]:
        col_list["left"] = [col for col in df.columns if col.startswith('left_dim_')]
        col_list["right"] = [col for col in df.columns if col.startswith('right_dim_')]
    elif cl.CLUSTER_TYPE == "HSV":
        col_list["HSV"] = ["hue", "sat", "val"]
    elif cl.CLUSTER_TYPE == "Clusters":
        col_list["face"] = [col for col in df.columns if col.startswith('dim_')]
    elif cl.CLUSTER_TYPE == "ObjectFusion":
        # for ObjectFusion, we need to get pitch, yaw, roll and object detection columns
        col_list["ObjectFusion"] = ['pitch', 'yaw', 'roll','both_hands_object', 'left_hand_object', 'right_hand_object',
                                    'top_face_object', 'bottom_face_object']
    return col_list
def zero_out_medians(cluster_median):
    """Replace any median vector whose components all sit below 0.01 in
    magnitude with an explicit all-zero vector of the same length.

    Mutates and returns the same dict; None entries are left untouched.
    """
    for key, median in cluster_median.items():
        if median is None:
            continue
        if all(abs(component) < .01 for component in median):
            print(f" --- >>> Setting cluster {key} median to all zeros.")
            cluster_median[key] = [0.0] * len(median)
    return cluster_median
def calculate_cluster_medians(df):
    """For every unique cluster_id in df, compute per-group geometric medians
    (body / left+right hands / HSV / face / ObjectFusion depending on
    cl.CLUSTER_TYPE), zero out near-zero medians, and flatten them into one
    vector per cluster.

    Args:
        df: DataFrame with a 'cluster_id' column plus the feature columns
            named by build_col_list().

    Returns:
        dict mapping cluster_id -> flattened median vector.
    """
    median_dict = {}
    col_list = build_col_list(df)
    print(f"Columns used for median calculation: {col_list}")
    print(f"All DataFrame columns for cl.CLUSTER_TYPE {cl.CLUSTER_TYPE}: {df.columns}")
    print(df)
    unique_clusters = set(df['cluster_id'])
    for cluster_id in unique_clusters:
        # cluster_median = calc_cluster_median(df, col_list, cluster_id)
        cluster_median = {}
        if cl.CLUSTER_TYPE == "ObjectFusion":
            print(f"[first call in elif] Calculating median for ObjectFusion cluster {cluster_id}")
            cluster_median["ObjectFusion"] = calc_cluster_median(df, col_list["ObjectFusion"], cluster_id)
            print(f"[first call in elif] Recalculated median for cluster {cluster_id}: {cluster_median}")
        elif "body" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
            # if cl.CLUSTER_TYPE in ("BodyPoses", "BodyPoses3D", "ArmsPoses3D"):
            cluster_median["body_lms"] = calc_cluster_median(df, col_list["body_lms"], cluster_id)
        elif "hand" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
            # elif cl.CLUSTER_TYPE in ["HandsPositions", "HandsGestures", "FingertipsPositions"]:
            cluster_median["left"] = calc_cluster_median(df, col_list["left"], cluster_id)
            cluster_median["right"] = calc_cluster_median(df, col_list["right"], cluster_id)
        elif cl.CLUSTER_TYPE == "HSV":
            cluster_median["HSV"] = calc_cluster_median(df, col_list["HSV"], cluster_id)
        elif cl.CLUSTER_TYPE == "Clusters":
            cluster_median["face"] = calc_cluster_median(df, col_list["face"], cluster_id)
        # NOTE(review): cluster_median is initialized to {} above, so this
        # check is always True; the inner per-group values may still be None
        # (calc_cluster_median returns None for invalid clusters), in which
        # case the flattening below can fail — confirm intended behavior.
        if cluster_median is not None:
            print(f"Recalculated median for cluster {cluster_id}: {cluster_median}")
            # if every value in the cluster median < .01, then set all values to 0.0
            cluster_median = zero_out_medians(cluster_median)
            # add left and right hands
            if cl.CLUSTER_TYPE == "ObjectFusion":
                flattened_median = cluster_median["ObjectFusion"]
            elif "hand" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
                # if cl.CLUSTER_TYPE in ["HandsPositions", "HandsGestures", "FingertipsPositions"]:
                flattened_median = np.concatenate((cluster_median["left"], cluster_median["right"]))
            elif "body" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
                flattened_median = cluster_median["body_lms"]
            elif cl.CLUSTER_TYPE == "HSV":
                flattened_median = cluster_median["HSV"]
            elif cl.CLUSTER_TYPE == "Clusters":
                flattened_median = cluster_median["face"]
            print(f"compressed cluster_median for {cluster_id}: {flattened_median}")
            median_dict[cluster_id] = flattened_median
        else:
            print(f"Cluster {cluster_id} has no valid points or data.")
    return median_dict
# TK 4 HSV
def set_cluster_metacluster():
    """Pick the active table pair for this run: the meta-cluster tables when
    META is on, otherwise the regular cluster tables.

    Returns:
        tuple: (cluster table class, crosswalk table class).
    """
    if META:
        return MetaClusters, ClustersMetaClusters
    return Clusters, ImagesClusters
def save_clusters_DB(median_dict, update=False):
    """Persist one row per cluster median into the active cluster table.

    New cluster_ids are inserted (median pickled into cluster_median);
    existing ones are updated only when update=True, otherwise skipped.

    Args:
        median_dict: mapping cluster_id -> median vector.
        update: when True, overwrite medians of already-saved clusters.
    """
    # Convert to set and Save the df to a table
    print("save_clusters_DB median_dict", median_dict)
    cluster_ids = median_dict.keys()
    print("save_clusters_DB cluster_ids", cluster_ids)
    this_cluster, this_crosswalk = set_cluster_metacluster()
    print("this_cluster: ", this_cluster)
    for cluster_id in cluster_ids:
        cluster_median = median_dict[cluster_id]
        # store the data in the database
        # Explicitly handle cluster_id 0
        # if cluster_id == 0:
        # print("Handling cluster_id 0 explicitly, checking for cluster 1. this cluster is ", cluster_id)
        # existing_record = session.query(Clusters).filter_by(cluster_id=1).first()
        # else:
        print("Checking for existing record with cluster_id ", cluster_id)
        # Check if the record already exists
        # this is where the timeout error is happening:
        '''
        sqlalchemy.exc.OperationalError: (pymysql.err.OperationalError) (2006, "MySQL server has gone away (BrokenPipeError(32, 'Broken pipe'))")
        [SQL: SELECT `BodyPoses3D`.cluster_id AS `BodyPoses3D_cluster_id`, `BodyPoses3D`.cluster_median AS `BodyPoses3D_cluster_median`
        FROM `BodyPoses3D`'''
        existing_record = session.query(this_cluster).filter_by(cluster_id=cluster_id).first()
        if existing_record is None:
            # Save the geometric median into the database
            print(f"Saving new record with cluster_id {cluster_id}")
            instance = this_cluster(
                cluster_id=cluster_id,
                cluster_median=pickle.dumps(cluster_median) # Serialize the geometric median
            )
            session.add(instance)
            session.flush() # Force database insertion to catch issues early
            session.commit()
            # extra read-back verification for the edge-case id 0
            if cluster_id == 0:
                saved_record = session.query(this_cluster).filter_by(cluster_id=0).first()
                if saved_record:
                    print(f"Successfully saved cluster_id 0: {saved_record}")
                else:
                    print("Failed to save cluster_id 0.")
        elif existing_record is not None and update:
            print(f"Updating existing record with cluster_id {cluster_id} and median {cluster_median}")
            existing_record.cluster_median = pickle.dumps(cluster_median)
        else:
            print(f"Skipping duplicate record with cluster_id {cluster_id}")
    # final commit flushes any pending updates from the loop above
    try:
        print(f"Attempting to commit session with {len(median_dict)}:")
        # for cluster_id, cluster_median in median_dict.items():
        # print(f"Cluster ID: {cluster_id}, Median: {cluster_median}")
        session.commit()
        print("Data saved successfully.")
    except IntegrityError as e:
        session.rollback()
        print(f"Error occurred during data saving: {str(e)}")
# TK 4 HSV - after calculating cluster distances
def save_images_clusters_DB(df):
    """
    Persist image->cluster (or cluster->meta-cluster) assignments to the DB.

    Depending on set_cluster_metacluster(), rows are written either to
    ImagesClusters (keyed by image_id) or to ClustersMetaClusters (where the
    df index is the cluster_id and the 'cluster_id' column holds the
    meta_cluster_id). Existing records are only touched when their
    cluster_dist is still NULL; everything is committed in one transaction.

    Args:
        df: DataFrame of assignments. For ImagesClusters it must carry
            'image_id', 'cluster_id' and 'cluster_dist' columns; for
            ClustersMetaClusters only 'cluster_id' (the meta id) is read.
    """
    image_id = cluster_id = cluster_dist = this_cluster_id = meta_cluster_id = None
    print("save_images_clusters_DB df", df)
    print("columns: ", df.columns)
    this_cluster, this_crosswalk = set_cluster_metacluster()
    print("this_crosswalk: ", this_crosswalk)
    for idx, row in df.iterrows():
        # BUGFIX: existing_record (and instance, below) were unbound when
        # this_crosswalk matched neither table; initialize them so the
        # branches below cannot raise NameError.
        existing_record = None
        instance = None
        if this_crosswalk == ImagesClusters:
            image_id = row['image_id']
            cluster_id = row['cluster_id']
            cluster_dist = row['cluster_dist']
            if any(pd.isna([image_id, cluster_id, cluster_dist])):
                print(f"Skipping row with NaN values: image_id={image_id}, cluster_id={cluster_id}, cluster_dist={cluster_dist}")
                continue
            existing_record = session.query(ImagesClusters).filter_by(image_id=image_id).first()
        elif this_crosswalk == ClustersMetaClusters:
            this_cluster_id = idx
            meta_cluster_id = row['cluster_id']
            cluster_dist = None
            print("this_cluster_id: ", this_cluster_id, " meta_cluster_id: ", meta_cluster_id)
            # look up the body3D cluster_id
            existing_record = session.query(ClustersMetaClusters).filter_by(cluster_id=this_cluster_id).first()
        if existing_record is None:
            if this_crosswalk == ImagesClusters:
                instance = ImagesClusters(
                    image_id=image_id,
                    cluster_id=cluster_id,
                    cluster_dist=cluster_dist
                )
            elif this_crosswalk == ClustersMetaClusters:
                instance = ClustersMetaClusters(
                    cluster_id=this_cluster_id,
                    meta_cluster_id=meta_cluster_id,
                    cluster_dist=cluster_dist
                )
            if instance is not None:
                session.add(instance)
        elif existing_record.cluster_dist is None:
            # BUGFIX: the id % 100 == 0 check previously gated the update
            # itself, silently dropping ~99% of pending distance updates.
            # It is now only a log throttle; every NULL distance is filled.
            if (image_id is not None and image_id % 100 == 0) or (this_cluster_id and this_cluster_id % 100 == 0):
                print(f"Updating existing record with image_id {image_id} to cluster_dist {cluster_dist}")
            existing_record.cluster_dist = cluster_dist
        else:
            print(f"Skipping duplicate record with image_id {image_id}")
    try:
        print(f"Attempting to commit session with {len(df)}:")
        session.commit()
        print("Data saved successfully.")
    except IntegrityError as e:
        # roll back so the session stays usable after a constraint violation
        session.rollback()
        print(f"Error occurred during data saving: {str(e)}")
def calc_median_dist(enc1, enc2):
    """Return the Euclidean (L2) distance between an encoding and a median.

    The operands are subtracted elementwise, so at least one of them must
    be a numpy array (or both must be broadcast-compatible arrays).
    """
    delta = enc1 - enc2
    return np.linalg.norm(delta, axis=0)
def process_landmarks_cluster_dist(df, df_subset_landmarks):
    """
    Build the combined 'enc1' encoding column and assign clusters/distances.

    Dimension columns are located in order of preference: integer column
    labels, 'dim_'-prefixed string labels, the configured HSV data column,
    or (fallback) every column except image_id / hand-position metadata.

    Behaviour then depends on df:
      - no 'cluster_id' column: assign cluster_id/cluster_dist to every row
      - some NULL cluster_ids:  assign only to the rows still missing one
      - otherwise:              compute cluster_dist against cluster_median

    Returns the (possibly merged) df_subset_landmarks with 'enc1' and the
    cluster columns populated.
    """
    first_col = df.columns[1]
    # if the column labels are ints, the dim columns are integer-labeled too
    if isinstance(first_col, int):
        dim_columns = [col for col in df_subset_landmarks.columns if isinstance(col, int)]
    elif any(isinstance(col, str) and col.startswith('dim_') for col in df_subset_landmarks.columns):
        # Step 1: Identify columns that contain "dim_"
        dim_columns = [col for col in df_subset_landmarks.columns if "dim_" in col]
    elif cl.CLUSTER_TYPE == "HSV":
        dim_columns = cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]
    else:
        print("process_landmarks_cluster_dist could not identify dim columns, defaulting to all columns except image_id and metadata")
        # Exclude image_id and hand position metadata columns from distance calculation
        exclude_cols = {'image_id', 'left_pointer_knuckle_norm', 'right_pointer_knuckle_norm', 'left_source', 'right_source'}
        dim_columns = [col for col in df_subset_landmarks.columns if col not in exclude_cols]
        print("process_landmarks_cluster_dist defaulted dim_columns: ", dim_columns)
    print("process_landmarks_cluster_dist dim_columns: ", dim_columns)
    # Step 2: Combine values from these columns into one list per row
    df_subset_landmarks['enc1'] = df_subset_landmarks[dim_columns].values.tolist()
    # Step 3: Print the result to check
    print("df_subset_landmarks", df_subset_landmarks[['image_id', 'enc1']])
    print("df_subset_landmarks columns", df_subset_landmarks.columns)
    print("df first row", df.iloc[0])
    if 'cluster_id' not in df.columns:
        # assign clusters to all rows
        cluster_results = df_subset_landmarks['enc1'].apply(cl.prep_pose_clusters_enc)
        df_subset_landmarks[['cluster_id', 'cluster_dist']] = pd.DataFrame(cluster_results.tolist(), index=df_subset_landmarks.index)
    elif df['cluster_id'].isnull().values.any():
        # BUGFIX: the previous code assigned zip(*series.apply(...)) via .loc,
        # which is both transposed relative to the (n rows x 2 cols) target
        # and an iterator pandas cannot set from. Mirror the working
        # pd.DataFrame(...) pattern from the branch above instead.
        null_mask = df_subset_landmarks['cluster_id'].isnull()
        cluster_results = df_subset_landmarks.loc[null_mask, 'enc1'].apply(cl.prep_pose_clusters_enc)
        df_subset_landmarks.loc[null_mask, ['cluster_id', 'cluster_dist']] = pd.DataFrame(
            cluster_results.tolist(), index=df_subset_landmarks.index[null_mask])
    else:
        # Ensure cluster_median is present on df_subset_landmarks by merging on image_id
        if 'cluster_median' not in df_subset_landmarks.columns:
            if 'image_id' in df_subset_landmarks.columns and 'image_id' in df.columns and 'cluster_median' in df.columns:
                # merge cluster_median from df into df_subset_landmarks
                df_subset_landmarks = df_subset_landmarks.merge(df[['image_id', 'cluster_id', 'cluster_median']], on='image_id', how='left')
            else:
                # fallback: try to copy if same index alignment
                try:
                    df_subset_landmarks['cluster_median'] = df['cluster_median']
                except Exception:
                    print('Could not attach cluster_median to df_subset_landmarks')
        print("df_subset_landmarks before calc_median_dist", df_subset_landmarks.columns)
        # apply calc_median_dist to enc1 and cluster_median
        print("df_subset_landmarks enc1 and cluster_median", df_subset_landmarks[['enc1', 'cluster_median']].iloc[0])
        df_subset_landmarks["cluster_dist"] = df_subset_landmarks.apply(lambda row: calc_median_dist(row['enc1'], row['cluster_median']), axis=1)
    return df_subset_landmarks
# def prep_pose_clusters_enc(enc1):
# # print("current image enc1", enc1)
# enc1 = np.array(enc1)
# this_dist_dict = {}
# for cluster_id in MEDIAN_DICT:
# enc2 = MEDIAN_DICT[cluster_id]
# # print("cluster_id enc2: ", cluster_id,enc2)
# this_dist_dict[cluster_id] = np.linalg.norm(enc1 - enc2, axis=0)
# cluster_id, cluster_dist = min(this_dist_dict.items(), key=lambda x: x[1])
# # print(cluster_id)
# return cluster_id, cluster_dist
def assign_images_clusters_DB(df):
    """
    Assign a cluster to each image's encodings and persist the assignments.

    Builds a columnar landmark frame from df, computes cluster_id /
    cluster_dist per row (via process_landmarks_cluster_dist for hand/body
    cluster types, or directly from 'obj_bbox_list' otherwise), then writes
    the results through save_images_clusters_DB.

    Args:
        df: DataFrame of images with pickled landmark/encoding columns, as
            consumed by landmarks_to_df_columnar.
    """
    #assign clusters to each image's encodings
    print("assigning images to clusters, df at start",df)
    df_subset_landmarks = landmarks_to_df_columnar(df, add_list=True)
    print("df_subset_landmarks after landmarks_to_df_columnar", df_subset_landmarks)
    # if cl.CLUSTER_TYPE in ["BodyPoses","BodyPoses3D","ArmsPoses3D", "HandsGestures", "HandsPositions","FingertipsPositions"]:
    if "hand" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"] or "body" in cl.CLUSTER_DATA[cl.CLUSTER_TYPE]["data_column"]:
        # combine all columns that start with left_dim_ or right_dim_ or dim_ into one list in the "enc1" column
        df_subset_landmarks = process_landmarks_cluster_dist(df, df_subset_landmarks)
    else:
        # this is for obj_bbox_list
        # NOTE(review): prep_pose_clusters_enc appears to return a
        # (cluster_id, cluster_dist) tuple — hence the unpack below.
        df_subset_landmarks["cluster_id"] = df_subset_landmarks["obj_bbox_list"].apply(cl.prep_pose_clusters_enc)
        # if the cluster_id column contains tuples or lists, unpack them
        if isinstance(df_subset_landmarks["cluster_id"].iloc[0], (list, tuple)):
            df_subset_landmarks[['cluster_id', 'cluster_dist']] = pd.DataFrame(df_subset_landmarks['cluster_id'].tolist(), index=df_subset_landmarks.index)
    print("df_subset_landmarks clustered after apply")
    print(df_subset_landmarks)
    print(df_subset_landmarks[["image_id", "cluster_id","cluster_dist"]])
    # print all rows where cluster_id is 68
    # print(df_subset_landmarks[df_subset_landmarks["cluster_id"] == 68])
    save_images_clusters_DB(df_subset_landmarks)
    print ("saved to imagesclusters")
def df_list_to_cols(df, col_name):
    """
    Expand a column of coordinate lists into one 'dim_{i}' column per
    coordinate and drop the original list column.

    The coordinate count is inferred from the first non-null value of
    df[col_name]. Rows whose value is missing/NaN or not indexable get None
    in every dim column. If the coordinate count cannot be determined
    (e.g. the column is entirely null), the original df is returned
    unchanged.

    Args:
        df: DataFrame with an 'image_id' column and a list-valued col_name.
        col_name: name of the list-valued column to expand.

    Returns:
        df with dim_0..dim_{n-1} columns replacing col_name, or the
        original df on failure.
    """
    # Convert the string representation of lists to actual lists
    # df[col_name] = df[col_name].apply(eval)
    df_data = df.drop("image_id", axis=1)
    # drop rows where col_name is None/NaN so we can inspect a real value
    print(f"Dropping rows with NaN in column '{col_name}'")
    print("Before dropna, df_data len:", len(df_data))
    df_data = df_data.dropna(subset=[col_name])
    print("After dropna, df_data len:", len(df_data))
    try:
        # Determine the number of coordinates from the first valid row
        num_coords = len(df_data[col_name].iloc[0])
    except Exception as e:
        print(f"Error determining number of coordinates in column '{col_name}': {e}")
        print("df_data", df_data)
        print("df_data", df_data[col_name])
        print("df_data", df_data[col_name].iloc[0])
        return df  # Return the original DataFrame if there's an error

    def _coord(value, i):
        # BUGFIX: NaN is a float, not None, so the old `x is not None` guard
        # let it through and `x[i]` raised TypeError. Treat any value that
        # cannot be indexed (or is too short) as missing.
        try:
            return value[i]
        except (TypeError, IndexError, KeyError):
            return None

    for i in range(num_coords):
        # bind i as a default to avoid the late-binding closure pitfall
        df[f'dim_{i}'] = df[col_name].apply(lambda x, i=i: _coord(x, i))
    # Drop the original col_name column
    df = df.drop(col_name, axis=1)
    return df
def extract_face_geometry(df, sort):
    """
    Extract face geometry (nose position and face height) from each image.

    For every row in df, unpickles the 'face_landmarks' blob (if present),
    runs it through the SortPose helper to compute the 2D nose point and
    face height, and collects one record per image_id.

    Args:
        df: DataFrame with 'image_id', 'w', 'h' and optionally
            'face_landmarks' columns.
        sort: SortPose-like helper. NOTE(review): this function mutates its
            faceLms, w, h, nose_2d, nose_x, nose_y and face_height
            attributes as scratch state — confirm callers do not rely on
            those afterwards.

    Returns:
        DataFrame with columns image_id, nose_x, nose_y, face_height
        (None where landmarks were missing or extraction failed).
    """
    face_data = []
    for idx, row in df.iterrows():
        image_id = row['image_id']
        # Extract face landmarks and calculate position using SortPose methods
        try:
            face_landmarks = row['face_landmarks'] if 'face_landmarks' in row else None
            # NOTE(review): pd.notna on an array-like returns an array and
            # would be ambiguous here — assumes the stored value is a scalar
            # (pickled bytes); verify upstream schema.
            if pd.notna(face_landmarks):
                # 'io' is presumably a project helper module (not stdlib io) — verify
                sort.faceLms = io.unpickle_array(face_landmarks)
                if sort.faceLms and hasattr(sort.faceLms, 'landmark') and len(sort.faceLms.landmark) > 0:
                    sort.w = row['w']
                    sort.h = row['h']
                    # Match utilities/draw_norm_lms.py process_face_doc()
                    # index 1 is presumably the nose-tip landmark — TODO confirm
                    sort.nose_2d = sort.get_face_2d_point(1)
                    sort.nose_x, sort.nose_y = sort.nose_2d
                    sort.get_faceheight_data()
                    face_data.append({
                        'image_id': image_id,
                        'nose_x': int(sort.nose_x) if sort.nose_x is not None else None,
                        'nose_y': int(sort.nose_y) if sort.nose_y is not None else None,
                        'face_height': int(sort.face_height) if sort.face_height is not None else None
                    })
                else:
                    # landmarks unpickled but empty/malformed: record a null row
                    face_data.append({'image_id': image_id, 'nose_x': None, 'nose_y': None, 'face_height': None})
            else:
                # no face_landmarks value for this row: record a null row
                face_data.append({'image_id': image_id, 'nose_x': None, 'nose_y': None, 'face_height': None})
        except Exception as e:
            # best-effort: log and record a null row rather than aborting the batch
            print(f"Error extracting face geometry for image_id {image_id}: {e}")
            face_data.append({'image_id': image_id, 'nose_x': None, 'nose_y': None, 'face_height': None})
    return pd.DataFrame(face_data)
def save_images_detections(df, engine):
"""
Save hand positions and object detections to ImagesDetections table.
Args:
df: DataFrame with clustered data (must have left_pointer_knuckle_norm, right_pointer_knuckle_norm,
left_source, right_source, and detection columns)
engine: SQLAlchemy engine for database connection
"""
print("\n[DEBUG] save_images_detections called")
print(f"[DEBUG] Input df shape: {df.shape}")
print(f"[DEBUG] Columns in df: {list(df.columns)}")
# Check if required columns exist
required_cols = ['left_pointer_knuckle_norm', 'right_pointer_knuckle_norm', 'left_source', 'right_source']
for col in required_cols:
if col in df.columns:
print(f"[DEBUG] ✓ Column '{col}' present")
else:
print(f"[DEBUG] ✗ Column '{col}' MISSING")
if len(df) > 0:
print(f"[DEBUG] Sample row (first image_id={df.iloc[0].get('image_id')}):")
print(f"[DEBUG] left_pointer_knuckle_norm: {df['left_pointer_knuckle_norm'].iloc[0] if 'left_pointer_knuckle_norm' in df.columns else 'N/A'}")
print(f"[DEBUG] right_pointer_knuckle_norm: {df['right_pointer_knuckle_norm'].iloc[0] if 'right_pointer_knuckle_norm' in df.columns else 'N/A'}")
print(f"[DEBUG] left_source: {df['left_source'].iloc[0] if 'left_source' in df.columns else 'N/A'}")
print(f"[DEBUG] right_source: {df['right_source'].iloc[0] if 'right_source' in df.columns else 'N/A'}")
images_detections_records = []
records_with_hand_data = 0
records_with_default_only = 0
for idx, row in df.iterrows():
image_id = row.get('image_id', 'UNKNOWN')
try:
left_pos = row.get('left_pointer_knuckle_norm')
right_pos = row.get('right_pointer_knuckle_norm')
# Helper to extract scalar detection_id from potentially nested structures
def extract_detection_id(val):
# Check None first
if val is None:
return None
# Safely check for NaN/empty
try:
if isinstance(val, (list, tuple)):
if len(val) == 0:
return None
first = val[0]
if first is None: