From 5fd2def0d754cc0d6bc3aef4da4cc8b5bb46025c Mon Sep 17 00:00:00 2001 From: init-22 Date: Sat, 22 Mar 2025 10:01:15 +0000 Subject: [PATCH 1/9] schedule free, first default version of pytorch and jax --- .../schedule_free/jax/submission.py | 222 ++++++++++++ .../schedule_free/pytorch/submission.py | 319 ++++++++++++++++++ 2 files changed, 541 insertions(+) create mode 100644 reference_algorithms/schedule_free/jax/submission.py create mode 100644 reference_algorithms/schedule_free/pytorch/submission.py diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py new file mode 100644 index 000000000..0e9f9c298 --- /dev/null +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -0,0 +1,222 @@ +"""Submission file for an Schedule Free AdamW optimizer in Jax.""" + +import functools +from typing import Dict, Iterator, List, Tuple +import optax + +from flax import jax_utils +import jax +from jax import lax +import jax.numpy as jnp +from optax.contrib import schedule_free_adamw +from algoperf import spec + +_GRAD_CLIP_EPS = 1e-6 + +HPARAMS = { + "dropout_rate": 0.1, + "learning_rate": 0.0025, + "one_minus_beta1": 0.1, + "beta2": 0.9955159689799007, + "weight_decay": 0.08121616522670176, + "warmup_factor": 0.02, + "weight_lr_power": 2, + "label_smoothing": 0.2, + "r": 0.75, + "eps": 1e-8, +} + +def init_optimizer_state(workload: spec.Workload, + model_params: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + rng: spec.RandomState) -> spec.OptimizerState: + """Creates an AdamW optimizer and a learning rate schedule.""" + del model_params + del model_state + del rng + lr=HPARAMS['learning_rate'] + betas=(1.0 - HPARAMS['one_minus_beta1'], HPARAMS['beta2']) + warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75) + weight_decay=HPARAMS['weight_decay'] + weight_lr_power=HPARAMS['weight_lr_power'] + r=HPARAMS['r'] + + opt_init_fn, opt_update_fn = schedule_free_adamw( + learning_rate=HPARAMS['learning_rate'], + warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75), + + b1=1.0 - HPARAMS['one_minus_beta1'], + b2=HPARAMS['beta2'], + eps=HPARAMS['eps'], + weight_decay=HPARAMS['weight_decay'], + weight_lr_power=HPARAMS['weight_lr_power'], + ) + params_zeros_like = jax.tree_map(lambda s: jnp.zeros(s.shape_tuple), + workload.param_shapes) + optimizer_state = opt_init_fn(params_zeros_like) + + return jax_utils.replicate(optimizer_state), opt_update_fn + + +@functools.partial( + jax.pmap, + axis_name='batch', + in_axes=(None, None, 0, 0, 0, 0, 0, None, None), + static_broadcasted_argnums=(0, 1), + donate_argnums=(2, 3, 4)) +def pmapped_train_step(workload, + opt_update_fn, + model_state, + optimizer_state, + current_param_container, + batch, + rng, + grad_clip, + label_smoothing): + + def _loss_fn(params): + """Loss function used for training.""" + logits, new_model_state = workload.model_fn( + params, + batch, + model_state, + spec.ForwardPassMode.TRAIN, + rng, + update_batch_norm=True) + loss_dict = workload.loss_fn( + label_batch=batch['targets'], + logits_batch=logits, + mask_batch=batch.get('weights'), + label_smoothing=label_smoothing) + summed_loss = loss_dict['summed'] + n_valid_examples = loss_dict['n_valid_examples'] + return summed_loss, (n_valid_examples, new_model_state) + + grad_fn = jax.value_and_grad(_loss_fn, has_aux=True) + (summed_loss, (n_valid_examples, new_model_state)), grad = grad_fn( + current_param_container) + # Get correct global 
mean loss and grad. + (summed_loss, n_valid_examples, grad) = lax.psum( + (summed_loss, n_valid_examples, grad), axis_name='batch') + loss = summed_loss / n_valid_examples + grad = jax.tree_map(lambda x: x / n_valid_examples, grad) + + grad_norm = jnp.sqrt( + sum(jnp.sum(g**2) for g in jax.tree_util.tree_leaves(grad))) + + # Extract the leaves of the pytree + leaves = jax.tree_util.tree_leaves(grad) + # Count the total number of elements in all leaves + total_size = sum(jnp.size(leaf) for leaf in leaves) + + # jax.debug.print('GRAD NORM {}', grad_norm) + # jax.debug.print('NUM PARAMS {}', total_size) + + if grad_clip is not None: + grad_scaling_factor = grad_clip / (grad_norm + _GRAD_CLIP_EPS) + grad_scaling_factor = jax.lax.clamp(min=0.0, x=grad_scaling_factor, max=1.0) + grad = jax.tree_map(lambda x: x * grad_scaling_factor, grad) + + updates, new_optimizer_state = opt_update_fn(grad, optimizer_state, + current_param_container) + updated_params = optax.apply_updates(current_param_container, updates) + return new_optimizer_state, updated_params, new_model_state, loss, grad_norm + + +def update_params(workload: spec.Workload, + current_param_container: spec.ParameterContainer, + current_params_types: spec.ParameterTypeTree, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + batch: Dict[str, spec.Tensor], + loss_type: spec.LossType, + optimizer_state: spec.OptimizerState, + eval_results: List[Tuple[int, float]], + global_step: int, + rng: spec.RandomState) -> spec.UpdateReturn: + """Return (updated_optimizer_state, updated_params, updated_model_state).""" + del current_params_types + del loss_type + del eval_results + + optimizer_state, opt_update_fn = optimizer_state + per_device_rngs = jax.random.split(rng, jax.local_device_count()) + if hasattr(hyperparameters, 'label_smoothing'): + label_smoothing = hyperparameters.label_smoothing + else: + label_smoothing = 0.0 + if hasattr(hyperparameters, 'grad_clip'): + grad_clip = hyperparameters.grad_clip + else: + grad_clip = None + outputs = pmapped_train_step(workload, + opt_update_fn, + model_state, + optimizer_state, + current_param_container, + batch, + per_device_rngs, + grad_clip, + label_smoothing) + new_optimizer_state, new_params, new_model_state, loss, grad_norm = outputs + + # Log loss, grad_norm. + if global_step % 100 == 0 and workload.metrics_logger is not None: + workload.metrics_logger.append_scalar_metrics( + { + 'loss': loss[0], + 'grad_norm': grad_norm[0], + }, global_step) + return (new_optimizer_state, opt_update_fn), new_params, new_model_state + + +def get_batch_size(workload_name): + # Return the global batch size. 
+ if workload_name == 'criteo1tb': + return 262_144 + elif workload_name == 'fastmri': + return 32 + elif workload_name == 'imagenet_resnet': + return 1024 + elif workload_name == 'imagenet_resnet_silu': + return 512 + elif workload_name == 'imagenet_resnet_gelu': + return 512 + elif workload_name == 'imagenet_vit': + return 1024 + elif workload_name == 'librispeech_conformer': + return 256 + elif workload_name == 'librispeech_deepspeech': + return 256 + elif workload_name == 'ogbg': + return 512 + elif workload_name == 'wmt': + return 128 + elif workload_name == 'mnist': + return 16 + else: + raise ValueError(f'Unsupported workload name: {workload_name}.') + + +def data_selection(workload: spec.Workload, + input_queue: Iterator[Dict[str, spec.Tensor]], + optimizer_state: spec.OptimizerState, + current_param_container: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + global_step: int, + rng: spec.RandomState) -> Dict[str, spec.Tensor]: + """Select data from the infinitely repeating, pre-shuffled input queue. + Each element of the queue is a batch of training examples and labels. + """ + del workload + del optimizer_state + del current_param_container + del model_state + del hyperparameters + del global_step + del rng + batch = next(input_queue) + return batch + diff --git a/reference_algorithms/schedule_free/pytorch/submission.py b/reference_algorithms/schedule_free/pytorch/submission.py new file mode 100644 index 000000000..450745737 --- /dev/null +++ b/reference_algorithms/schedule_free/pytorch/submission.py @@ -0,0 +1,319 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import math +from typing import Dict, Iterator, List, Tuple +from absl import logging +import torch +import torch.distributed.nn as dist_nn +from algoperf import spec +from algoperf.pytorch_utils import pytorch_setup + +USE_PYTORCH_DDP = pytorch_setup()[0] +HPARAMS = { + "dropout_rate": 0.1, + "learning_rate": 0.0025, + "one_minus_beta1": 0.1, + "beta2": 0.9955159689799007, + "weight_decay": 0.08121616522670176, + "warmup_factor": 0.02, + "weight_lr_power": 2, + "label_smoothing": 0.2, + "r": 0.75, + "conformer_bs": 192, +} + + +class AdamWScheduleFree(torch.optim.Optimizer): + r"""Schedule Free AdamW + """ + def __init__(self, params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + weight_lr_power=2, + warmup_steps=0, + r=0, + ): + defaults = dict(lr=lr, + betas=betas, + eps=eps, + r=r, + k=0, + weight_sum=0.0, + lr_max=0.0, + warmup_steps=warmup_steps, + weight_lr_power=weight_lr_power, + weight_decay=weight_decay) + + super().__init__(params, defaults) + + def reset(self): + for group in self.param_groups: + group['k'] = 0 + group['lr_max'] = 0 + group['weight_sum'] = 0 + + for p in group['params']: + # State initialization + state = self.state[p] + state['z'].copy_(state['x0']) + p.data.copy_(state['x0']) + state['exp_avg_sq'].zero_() + + def step(self, closure): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + # Swap to extrapolated point: + for group in self.param_groups: + beta1, beta2 = group['betas'] + r = group['r'] + k = group['k'] + + for p in group['params']: + # State initialization + state = self.state[p] + if 'z' not in state: + state['z'] = torch.clone(p.data) + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.bfloat16) + state['x0'] = p.data.cpu() + + z = state['z'] + + # Extrapolate + #p = p + (1-beta1)*(z-p) + #p.data.mul_(beta1).add_(z, alpha=1-beta1) + p.data.lerp_(end=z, weight=1-beta1) + + # Evaluate gradient at extrapolated point + loss = closure() + + for group in self.param_groups: + eps = group['eps'] + k = group['k'] + warmup_steps = group['warmup_steps'] + + if k < warmup_steps: + sched = (k+1) / warmup_steps + else: + sched = 1.0 + annealed_lr = group['lr']*sched + + lr = max(annealed_lr, eps) + + decay = group['weight_decay'] + beta1, beta2 = group['betas'] + weight_lr_power = group['weight_lr_power'] + + r = group['r'] + lr_max = group['lr_max'] = max(lr, group['lr_max']) + + weight = ((k+1)**r) * (lr_max**weight_lr_power) + weight_sum = group['weight_sum'] = group['weight_sum'] + weight + + ckp1 = weight/weight_sum + + bias_correction2 = 1 - beta2 ** (k+1) + step_size = lr * math.sqrt(bias_correction2) + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + + state = self.state[p] + + exp_avg_sq = state['exp_avg_sq'] + z = state['z'] + + # Unextrapolate + #p = (p - (1-beta1)*z)/beta1 + #p.data.sub_(z, alpha=1-beta1).div_(beta1) + p.data.lerp_(end=z, weight=1-1/beta1) + + # Decay the first and second moment running average coefficient + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1-beta2) + denom = exp_avg_sq.sqrt().add_(eps) + + z.addcdiv_(grad, denom, value=-step_size) + + # Decay + z.sub_(p.data, alpha=step_size*decay) + + ### Take step + #p.data.mul_(1-ckp1).add_(z, alpha=ckp1) + p.data.lerp_(end=z, weight=ckp1) + + group['k'] = k+1 + return loss + +def init_optimizer_state(workload: spec.Workload, + model_params: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + rng: spec.RandomState) -> spec.OptimizerState: + del model_state + + optimizer = AdamWScheduleFree( + model_params.parameters(), + lr=HPARAMS['learning_rate'], + betas=(1.0 - HPARAMS['one_minus_beta1'], HPARAMS['beta2']), + warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75), + weight_decay=HPARAMS['weight_decay'], + weight_lr_power=HPARAMS['weight_lr_power'], + r=HPARAMS['r']) + + optimizer_state = {'optimizer':optimizer, 'max_checked_eval_step': -1, 'has_forced_reset': False, 'first_eval': False, } + return optimizer_state + +def update_params(workload: spec.Workload, + current_param_container: spec.ParameterContainer, + current_params_types: spec.ParameterTypeTree, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + batch: Dict[str, spec.Tensor], + loss_type: spec.LossType, + optimizer_state: spec.OptimizerState, + eval_results: List[Tuple[int, float]], + global_step: int, + rng: spec.RandomState) -> spec.UpdateReturn: + """Return (updated_optimizer_state, updated_params, updated_model_state).""" + del current_params_types + del loss_type + del hyperparameters + + + metric_name = workload.target_metric_name + # TODO - remove force_reset + eval_step = len(eval_results) + if (global_step > workload.step_hint*0.10) and optimizer_state['max_checked_eval_step'] < eval_step: + optimizer_state['max_checked_eval_step'] = eval_step + # Don't do 
resetting on workloads that don't run eval often enough + if len(eval_results) >= 4: # and optimizer_state["first_eval_is_far_from_target"]: + val_metric = f"validation/{metric_name}" + initial_eval = eval_results[0][1][val_metric] + latest_eval = eval_results[-1][1][val_metric] + second_latest_eval = eval_results[-2][1][val_metric] + third_latest_eval = eval_results[-3][1][val_metric] + fourth_latest_eval = eval_results[-4][1][val_metric] + MARGIN = 0.01 + if metric_name in ["loss", "wer"]: + # Decreasing eval workloads should be flipped + initial_eval = -initial_eval + latest_eval = -latest_eval + second_latest_eval = -second_latest_eval + third_latest_eval = -third_latest_eval + fourth_latest_eval = -fourth_latest_eval + # Higher is better + # scale as a curve from 0 --> 1 + # if the eval values are far from the target (i.e. - worse than initial) and stays far from the target for 4 evals + if (latest_eval - initial_eval < MARGIN) and (latest_eval - second_latest_eval < MARGIN) and (second_latest_eval - third_latest_eval < MARGIN) and (third_latest_eval - fourth_latest_eval < MARGIN): + # Reset parameters since we appear to have diverged + logging.info("Reseting All Weights ") + logging.info(f"Global Step: {global_step}") + optimizer_state['has_forced_reset'] = True + + # Perform reset + del model_state + model_state = None + optimizer_state['optimizer'].reset() + + # Decrease learning rate by 2x if it diverged. + for param_group in optimizer_state['optimizer'].param_groups: + param_group['lr'] = param_group['lr']/2.0 + + ########### + + current_model = current_param_container + current_model.train() + + new_model_state = None + + def closure(): + nonlocal new_model_state + optimizer_state['optimizer'].zero_grad() + + logits_batch, new_model_state = workload.model_fn( + params=current_model, + augmented_and_preprocessed_input_batch=batch, + model_state=model_state, + mode=spec.ForwardPassMode.TRAIN, + rng=rng, + update_batch_norm=True) + + loss_dict = workload.loss_fn( + label_batch=batch['targets'], + logits_batch=logits_batch, + mask_batch=batch.get('weights'), + label_smoothing=HPARAMS['label_smoothing']) + summed_loss = loss_dict['summed'] + n_valid_examples = loss_dict['n_valid_examples'] + if USE_PYTORCH_DDP: + # Use dist_nn.all_reduce to ensure correct loss and gradient scaling. + summed_loss = dist_nn.all_reduce(summed_loss) + n_valid_examples = dist_nn.all_reduce(n_valid_examples) + loss = summed_loss / n_valid_examples + + loss.backward() + return loss + + loss = optimizer_state['optimizer'].step(closure) + + return (optimizer_state, current_param_container, new_model_state) + + +def get_batch_size(workload_name): + # Return the global batch size. 
+ if workload_name == 'criteo1tb': + return 262_144 + elif workload_name == 'fastmri': + return 16 # 32 + elif workload_name == 'imagenet_resnet': + return 1024 + elif workload_name == 'imagenet_vit': + return 1024 + elif workload_name == 'librispeech_conformer': + return 224 + elif workload_name == 'librispeech_deepspeech': + return 128 # 256 + elif workload_name == 'ogbg': + return 512 + elif workload_name == 'wmt': + return 128 + elif workload_name == 'mnist': + return 16 + elif workload_name == 'imagenet_resnet_gelu': + return 512 + elif workload_name == 'imagenet_resnet_silu': + return 512 + else: + raise ValueError(f'Unsupported workload name: {workload_name}.') + +def data_selection(workload: spec.Workload, + input_queue: Iterator[Dict[str, spec.Tensor]], + optimizer_state: spec.OptimizerState, + current_param_container: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + global_step: int, + rng: spec.RandomState) -> Dict[str, spec.Tensor]: + """Select data from the infinitely repeating, pre-shuffled input queue. + Each element of the queue is a batch of training examples and labels. + """ + del workload + del optimizer_state + del current_param_container + del model_state + del hyperparameters + del global_step + del rng + batch = next(input_queue) + return batch + From 19fe1f36567c6930f8fd5d4b70bbb24e1e676269 Mon Sep 17 00:00:00 2001 From: init-22 Date: Mon, 30 Jun 2025 18:48:23 +0000 Subject: [PATCH 2/9] fix: setting it up like before --- .../criteo1tb/criteo1tb_jax/workload.py | 4 + custom_pytorch_jax_converter.py | 186 ++++++++++++++++++ .../schedule_free/jax/submission.py | 10 + .../schedule_free/pytorch/submission.py | 8 + 4 files changed, 208 insertions(+) create mode 100644 custom_pytorch_jax_converter.py diff --git a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py index 91761e458..91b7adcb9 100644 --- a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py +++ b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py @@ -13,6 +13,8 @@ from algoperf.workloads.criteo1tb.criteo1tb_jax import models from algoperf.workloads.criteo1tb.workload import \ BaseCriteo1TbDlrmSmallWorkload +from custom_pytorch_jax_converter import use_pytorch_weights_inplace, use_pytorch_weights_inplace_mnist + class Criteo1TbDlrmSmallWorkload(BaseCriteo1TbDlrmSmallWorkload): @@ -103,8 +105,10 @@ def init_model_fn( {'params': params_rng, 'dropout': dropout_rng}, jnp.ones(input_shape, jnp.float32)) initial_params = initial_variables['params'] + initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_24_june.pth") self._param_shapes = param_utils.jax_param_shapes(initial_params) self._param_types = param_utils.jax_param_types(self._param_shapes) + return initial_params, None return jax_utils.replicate(initial_params), None def is_output_params(self, param_key: spec.ParameterKey) -> bool: diff --git a/custom_pytorch_jax_converter.py b/custom_pytorch_jax_converter.py new file mode 100644 index 000000000..25cebd618 --- /dev/null +++ b/custom_pytorch_jax_converter.py @@ -0,0 +1,186 @@ +import torch +import numpy as np +import jax +import jax.numpy as jnp +import logging +import copy +import copy +from jax.tree_util import tree_map +""" +Jax default parameter structure: +dict_keys(['Dense_0', 'Dense_1', 'Dense_2', 'Dense_3', 'Dense_4', 'Dense_5', 'Dense_6', 'Dense_7', 'embedding_table']) + +Pytorch stateduct structure: 
+dict_keys(['embedding_chunk_0', 'embedding_chunk_1', 'embedding_chunk_2', 'embedding_chunk_3', 'bot_mlp.0.weight', 'bot_mlp.0.bias', 'bot_mlp.2.weight', 'bot_mlp.2.bias', 'bot_mlp.4.weight', 'bot_mlp.4.bias', 'top_mlp.0.weight', 'top_mlp.0.bias', 'top_mlp.2.weight', 'top_mlp.2.bias', 'top_mlp.4.weight', 'top_mlp.4.bias', 'top_mlp.6.weight', 'top_mlp.6.bias', 'top_mlp.8.weight', 'top_mlp.8.bias']) + + + +The following function converts the PyTorch weights to the Jax format +and assigns them to the Jax model parameters. +The function assumes that the Jax model parameters are already initialized +and that the PyTorch weights are in the correct format. +""" +def use_pytorch_weights_inplace(jax_params, file_name=None, replicate=False): + + # Load PyTorch state_dict + state_dict = torch.load(file_name) + print(state_dict.keys()) + # Convert PyTorch tensors to NumPy arrays + numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} + + # --- Embedding Table --- + embedding_table = np.concatenate([ + numpy_weights[f'embedding_chunk_{i}'] for i in range(4) + ], axis=0) # adjust axis depending on chunking direction + + jax_params['embedding_table'] = jnp.array(embedding_table) + + # --- Bot MLP: Dense_0 to Dense_2 --- + for i, j in zip([0, 2, 4], range(3)): + jax_params[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'bot_mlp.{i}.weight'].T) + jax_params[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'bot_mlp.{i}.bias']) + + # --- Top MLP: Dense_3 to Dense_7 --- + for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): + jax_params[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) + jax_params[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) + #jax_params = tree_map(lambda x: jnp.array(x), jax_params) + del state_dict + return jax_params + + +def use_pytorch_weights_inplace_mnist(jax_params, file_name=None, replicate=False): + # Load the PyTorch checkpoint + ckpt = torch.load(file_name) + state_dict = ckpt['state_dict'] if 'state_dict' in ckpt else ckpt + + print("Loaded PyTorch keys:", state_dict.keys()) + + # Convert to numpy + numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} + + # Mapping PyTorch keys → JAX Dense layers + layer_map = { + 'net.layer1': 'Dense_0', + 'net.layer2': 'Dense_1', + } + + for pt_name, jax_name in layer_map.items(): + weight_key = f"{pt_name}.weight" + bias_key = f"{pt_name}.bias" + + if weight_key not in numpy_weights or bias_key not in numpy_weights: + raise KeyError(f"Missing keys: {weight_key} or {bias_key} in PyTorch weights") + + jax_params[jax_name]['kernel'] = jnp.array(numpy_weights[weight_key].T) # Transpose! 
+ jax_params[jax_name]['bias'] = jnp.array(numpy_weights[bias_key]) + + return jax_params + + +# def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6): +# """Compares two JAX PyTrees of weights and prints where they differ.""" +# all_equal = True + +# def compare_fn(p1, p2): +# nonlocal all_equal +# #if not jnp.allclose(p1, p2): +# if not jnp.allclose(p1, p2, atol=atol, rtol=rtol): +# logging.info("❌ Mismatch found:") +# logging.info(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}") +# logging.info(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}") +# all_equal = False +# return jnp.allclose(p1, p2, atol=atol, rtol=rtol) + +# try: +# _ = jax.tree_util.tree_map(compare_fn, params1, params2) +# except Exception as e: +# logging.info("❌ Structure mismatch or error during comparison:", e) +# return False + +# if all_equal: +# logging.info("✅ All weights are equal (within tolerance)") +# return all_equal + +import jax +import jax.numpy as jnp +import logging + +def maybe_unreplicate(pytree): + """If leading axis matches device count, strip it assuming it's pmap replication.""" + num_devices = jax.device_count() + return jax.tree_util.tree_map( + lambda x: x[0] if isinstance(x, jax.Array) and x.shape[0] == num_devices else x, + pytree + ) + +def move_to_cpu(tree): + return jax.tree_util.tree_map(lambda x: jax.device_put(x, device=jax.devices("cpu")[0]), tree) + + +def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6): + """Compares two JAX PyTrees of weights and logs where they differ, safely handling PMAP replication.""" + # Attempt to unreplicate if needed + params1 = maybe_unreplicate(params1) + params2 = maybe_unreplicate(params2) + + params1 = move_to_cpu(params1) + params2 = move_to_cpu(params2) + + all_equal = True + + def compare_fn(p1, p2): + nonlocal all_equal + if not jnp.allclose(p1, p2, atol=atol, rtol=rtol): + logging.info("❌ Mismatch found:") + logging.info(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}") + logging.info(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}") + all_equal = False + return jnp.allclose(p1, p2, atol=atol, rtol=rtol) + + try: + jax.tree_util.tree_map(compare_fn, params1, params2) + except Exception as e: + logging.info("❌ Structure mismatch or error during comparison:", exc_info=True) + return False + + if all_equal: + logging.info("✅ All weights are equal (within tolerance)") + return all_equal + + + +def use_pytorch_weights2(jax_params, file_name=None, replicate=False): + + def deep_copy_to_cpu(pytree): + return tree_map(lambda x: jax.device_put(jnp.array(copy.deepcopy(x)), device=jax.devices("cpu")[0]), pytree) + + breakpoint() + jax_copy = deep_copy_to_cpu(jax_params) + # Load PyTorch state_dict lazily to CPU + state_dict = torch.load(file_name, map_location='cpu') + print(state_dict.keys()) + # Convert PyTorch tensors to NumPy arrays + numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} + + # --- Embedding Table --- + embedding_table = np.concatenate([ + numpy_weights[f'embedding_chunk_{i}'] for i in range(4) + ], axis=0) # adjust axis depending on chunking direction + + jax_copy['embedding_table'] = jnp.array(embedding_table) + + # --- Bot MLP: Dense_0 to Dense_2 --- + for i, j in zip([0, 2, 4], range(3)): + jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'bot_mlp.{i}.weight'].T) + jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'bot_mlp.{i}.bias']) + + # --- Top MLP: Dense_3 to Dense_7 --- + for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): + jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) 
+ jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) + #jax_copy = tree_map(lambda x: jnp.array(x), jax_copy) + del state_dict + + return jax_copy + diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index 0e9f9c298..7035e9bef 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -10,6 +10,7 @@ import jax.numpy as jnp from optax.contrib import schedule_free_adamw from algoperf import spec +from custom_pytorch_jax_converter import use_pytorch_weights2, are_weights_equal _GRAD_CLIP_EPS = 1e-6 @@ -168,6 +169,15 @@ def update_params(workload: spec.Workload, 'loss': loss[0], 'grad_norm': grad_norm[0], }, global_step) + + # Log the number of parameters. + if global_step % 100 == 0 and workload.metrics_logger is not None: + date_ = "2025=06-14" + file_name = f"/results/schedule_free_test_pytorch_weights/criteo1tb_{date_}_after_{global_step}.pth" + params = use_pytorch_weights2(new_params, file_name=file_name, replicate=True) + are_weights_equal(new_params, params) + del params + return (new_optimizer_state, opt_update_fn), new_params, new_model_state diff --git a/reference_algorithms/schedule_free/pytorch/submission.py b/reference_algorithms/schedule_free/pytorch/submission.py index 450745737..ef3135fd8 100644 --- a/reference_algorithms/schedule_free/pytorch/submission.py +++ b/reference_algorithms/schedule_free/pytorch/submission.py @@ -266,6 +266,14 @@ def closure(): loss = optimizer_state['optimizer'].step(closure) + # # current_state = current_model.module.state_dict() is a workaround for DDP + # if global_step==1: + # torch.save(current_model.module.state_dict(), "/results/pytorch_base_model_criteo1tb_11may_global_step2.pth") + # import torch.distributed as dist + # import sys + # dist.destroy_process_group() + # sys.exit(0) + return (optimizer_state, current_param_container, new_model_state) From 890f5afbe08674183ee7e09b773d4f58f1a85f82 Mon Sep 17 00:00:00 2001 From: init-22 Date: Mon, 30 Jun 2025 18:58:31 +0000 Subject: [PATCH 3/9] fix: prev fix to the code --- reference_algorithms/schedule_free/jax/submission.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index 7035e9bef..879d76b18 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -33,7 +33,7 @@ def init_optimizer_state(workload: spec.Workload, hyperparameters: spec.Hyperparameters, rng: spec.RandomState) -> spec.OptimizerState: """Creates an AdamW optimizer and a learning rate schedule.""" - del model_params + model_params del model_state del rng lr=HPARAMS['learning_rate'] @@ -53,9 +53,9 @@ def init_optimizer_state(workload: spec.Workload, weight_decay=HPARAMS['weight_decay'], weight_lr_power=HPARAMS['weight_lr_power'], ) - params_zeros_like = jax.tree_map(lambda s: jnp.zeros(s.shape_tuple), - workload.param_shapes) - optimizer_state = opt_init_fn(params_zeros_like) + + model_params = jax_utils.unreplicate(model_params) + optimizer_state = opt_init_fn(model_params) return jax_utils.replicate(optimizer_state), opt_update_fn @@ -177,7 +177,7 @@ def update_params(workload: spec.Workload, params = use_pytorch_weights2(new_params, file_name=file_name, replicate=True) are_weights_equal(new_params, params) del params - + return (new_optimizer_state, 
opt_update_fn), new_params, new_model_state From 7ec211f98eea309871516e2fb930721c5d78b4c4 Mon Sep 17 00:00:00 2001 From: init-22 Date: Tue, 1 Jul 2025 04:41:50 +0000 Subject: [PATCH 4/9] getting it to level 0 --- algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py | 3 +-- reference_algorithms/schedule_free/jax/submission.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py index 91b7adcb9..2cae54dbc 100644 --- a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py +++ b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py @@ -13,7 +13,7 @@ from algoperf.workloads.criteo1tb.criteo1tb_jax import models from algoperf.workloads.criteo1tb.workload import \ BaseCriteo1TbDlrmSmallWorkload -from custom_pytorch_jax_converter import use_pytorch_weights_inplace, use_pytorch_weights_inplace_mnist +from custom_pytorch_jax_converter import use_pytorch_weights_inplace @@ -108,7 +108,6 @@ def init_model_fn( initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_24_june.pth") self._param_shapes = param_utils.jax_param_shapes(initial_params) self._param_types = param_utils.jax_param_types(self._param_shapes) - return initial_params, None return jax_utils.replicate(initial_params), None def is_output_params(self, param_key: spec.ParameterKey) -> bool: diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index 879d76b18..fabf8e350 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -172,8 +172,8 @@ def update_params(workload: spec.Workload, # Log the number of parameters. 
if global_step % 100 == 0 and workload.metrics_logger is not None: - date_ = "2025=06-14" - file_name = f"/results/schedule_free_test_pytorch_weights/criteo1tb_{date_}_after_{global_step}.pth" + date_ = "2025-06-14" + file_name = f"/results/schedule_free_test_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth" params = use_pytorch_weights2(new_params, file_name=file_name, replicate=True) are_weights_equal(new_params, params) del params From a775004a0b84bde202f33463adc9c9b7ec47062a Mon Sep 17 00:00:00 2001 From: init-22 Date: Tue, 1 Jul 2025 05:15:06 +0000 Subject: [PATCH 5/9] shuffling the functions for better understanding --- custom_pytorch_jax_converter.py | 115 +++++++++--------- .../schedule_free/jax/submission.py | 4 +- 2 files changed, 58 insertions(+), 61 deletions(-) diff --git a/custom_pytorch_jax_converter.py b/custom_pytorch_jax_converter.py index 25cebd618..1358e823e 100644 --- a/custom_pytorch_jax_converter.py +++ b/custom_pytorch_jax_converter.py @@ -6,6 +6,7 @@ import copy import copy from jax.tree_util import tree_map + """ Jax default parameter structure: dict_keys(['Dense_0', 'Dense_1', 'Dense_2', 'Dense_3', 'Dense_4', 'Dense_5', 'Dense_6', 'Dense_7', 'embedding_table']) @@ -44,11 +45,45 @@ def use_pytorch_weights_inplace(jax_params, file_name=None, replicate=False): for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): jax_params[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) jax_params[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) - #jax_params = tree_map(lambda x: jnp.array(x), jax_params) + del state_dict return jax_params +def use_pytorch_weights_cpu_copy(jax_params, file_name=None, replicate=False): + + def deep_copy_to_cpu(pytree): + return tree_map(lambda x: jax.device_put(jnp.array(copy.deepcopy(x)), device=jax.devices("cpu")[0]), pytree) + + jax_copy = deep_copy_to_cpu(jax_params) + # Load PyTorch state_dict lazily to CPU + state_dict = torch.load(file_name, map_location='cpu') + print(state_dict.keys()) + # Convert PyTorch tensors to NumPy arrays + numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} + + # --- Embedding Table --- + embedding_table = np.concatenate([ + numpy_weights[f'embedding_chunk_{i}'] for i in range(4) + ], axis=0) # adjust axis depending on chunking direction + + jax_copy['embedding_table'] = jnp.array(embedding_table) + + # --- Bot MLP: Dense_0 to Dense_2 --- + for i, j in zip([0, 2, 4], range(3)): + jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'bot_mlp.{i}.weight'].T) + jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'bot_mlp.{i}.bias']) + + # --- Top MLP: Dense_3 to Dense_7 --- + for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): + jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) + jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) + #jax_copy = tree_map(lambda x: jnp.array(x), jax_copy) + del state_dict + + return jax_copy + + def use_pytorch_weights_inplace_mnist(jax_params, file_name=None, replicate=False): # Load the PyTorch checkpoint ckpt = torch.load(file_name) @@ -78,34 +113,6 @@ def use_pytorch_weights_inplace_mnist(jax_params, file_name=None, replicate=Fals return jax_params -# def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6): -# """Compares two JAX PyTrees of weights and prints where they differ.""" -# all_equal = True - -# def compare_fn(p1, p2): -# nonlocal all_equal -# #if not jnp.allclose(p1, p2): -# if not jnp.allclose(p1, p2, atol=atol, rtol=rtol): 
-# logging.info("❌ Mismatch found:") -# logging.info(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}") -# logging.info(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}") -# all_equal = False -# return jnp.allclose(p1, p2, atol=atol, rtol=rtol) - -# try: -# _ = jax.tree_util.tree_map(compare_fn, params1, params2) -# except Exception as e: -# logging.info("❌ Structure mismatch or error during comparison:", e) -# return False - -# if all_equal: -# logging.info("✅ All weights are equal (within tolerance)") -# return all_equal - -import jax -import jax.numpy as jnp -import logging - def maybe_unreplicate(pytree): """If leading axis matches device count, strip it assuming it's pmap replication.""" num_devices = jax.device_count() @@ -150,37 +157,27 @@ def compare_fn(p1, p2): -def use_pytorch_weights2(jax_params, file_name=None, replicate=False): - - def deep_copy_to_cpu(pytree): - return tree_map(lambda x: jax.device_put(jnp.array(copy.deepcopy(x)), device=jax.devices("cpu")[0]), pytree) - - breakpoint() - jax_copy = deep_copy_to_cpu(jax_params) - # Load PyTorch state_dict lazily to CPU - state_dict = torch.load(file_name, map_location='cpu') - print(state_dict.keys()) - # Convert PyTorch tensors to NumPy arrays - numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} - - # --- Embedding Table --- - embedding_table = np.concatenate([ - numpy_weights[f'embedding_chunk_{i}'] for i in range(4) - ], axis=0) # adjust axis depending on chunking direction - - jax_copy['embedding_table'] = jnp.array(embedding_table) +# def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6): +# """Compares two JAX PyTrees of weights and prints where they differ.""" +# all_equal = True - # --- Bot MLP: Dense_0 to Dense_2 --- - for i, j in zip([0, 2, 4], range(3)): - jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'bot_mlp.{i}.weight'].T) - jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'bot_mlp.{i}.bias']) +# def compare_fn(p1, p2): +# nonlocal all_equal +# #if not jnp.allclose(p1, p2): +# if not jnp.allclose(p1, p2, atol=atol, rtol=rtol): +# logging.info("❌ Mismatch found:") +# logging.info(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}") +# logging.info(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}") +# all_equal = False +# return jnp.allclose(p1, p2, atol=atol, rtol=rtol) - # --- Top MLP: Dense_3 to Dense_7 --- - for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): - jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) - jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) - #jax_copy = tree_map(lambda x: jnp.array(x), jax_copy) - del state_dict +# try: +# _ = jax.tree_util.tree_map(compare_fn, params1, params2) +# except Exception as e: +# logging.info("❌ Structure mismatch or error during comparison:", e) +# return False - return jax_copy +# if all_equal: +# logging.info("✅ All weights are equal (within tolerance)") +# return all_equal diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index fabf8e350..4d226a2f6 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -10,7 +10,7 @@ import jax.numpy as jnp from optax.contrib import schedule_free_adamw from algoperf import spec -from custom_pytorch_jax_converter import use_pytorch_weights2, are_weights_equal +from custom_pytorch_jax_converter import use_pytorch_weights_cpu_copy, are_weights_equal _GRAD_CLIP_EPS = 1e-6 @@ -174,7 +174,7 @@ def 
update_params(workload: spec.Workload, if global_step % 100 == 0 and workload.metrics_logger is not None: date_ = "2025-06-14" file_name = f"/results/schedule_free_test_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth" - params = use_pytorch_weights2(new_params, file_name=file_name, replicate=True) + params = use_pytorch_weights_cpu_copy(new_params, file_name=file_name, replicate=True) are_weights_equal(new_params, params) del params From 9837910dfbb1e69aa68421882cad3b3481bc1ed2 Mon Sep 17 00:00:00 2001 From: init-22 Date: Wed, 2 Jul 2025 14:48:19 +0000 Subject: [PATCH 6/9] changes to save and test the models --- .../criteo1tb/criteo1tb_jax/workload.py | 2 +- .../criteo1tb/criteo1tb_pytorch/workload.py | 1 + custom_pytorch_jax_converter.py | 3 + .../schedule_free/jax/submission.py | 8 +- submission_runner.py | 263 +++++++++--------- 5 files changed, 148 insertions(+), 129 deletions(-) diff --git a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py index 2cae54dbc..8d98f09ea 100644 --- a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py +++ b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py @@ -105,7 +105,7 @@ def init_model_fn( {'params': params_rng, 'dropout': dropout_rng}, jnp.ones(input_shape, jnp.float32)) initial_params = initial_variables['params'] - initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_24_june.pth") + initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_1_july.pth") self._param_shapes = param_utils.jax_param_shapes(initial_params) self._param_types = param_utils.jax_param_types(self._param_shapes) return jax_utils.replicate(initial_params), None diff --git a/algoperf/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algoperf/workloads/criteo1tb/criteo1tb_pytorch/workload.py index 726aa8705..afd25c7d2 100644 --- a/algoperf/workloads/criteo1tb/criteo1tb_pytorch/workload.py +++ b/algoperf/workloads/criteo1tb/criteo1tb_pytorch/workload.py @@ -88,6 +88,7 @@ def init_model_fn( dropout_rate=dropout_rate, use_layer_norm=self.use_layer_norm, embedding_init_multiplier=self.embedding_init_multiplier) + torch.save(model.state_dict(), '/results/pytorch_base_model_criteo1tb_1_july.pth') self._param_shapes = param_utils.pytorch_param_shapes(model) self._param_types = param_utils.pytorch_param_types(self._param_shapes) model.to(DEVICE) diff --git a/custom_pytorch_jax_converter.py b/custom_pytorch_jax_converter.py index 1358e823e..7800189dd 100644 --- a/custom_pytorch_jax_converter.py +++ b/custom_pytorch_jax_converter.py @@ -26,6 +26,7 @@ def use_pytorch_weights_inplace(jax_params, file_name=None, replicate=False): # Load PyTorch state_dict state_dict = torch.load(file_name) print(state_dict.keys()) + # Convert PyTorch tensors to NumPy arrays numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} @@ -59,6 +60,7 @@ def deep_copy_to_cpu(pytree): # Load PyTorch state_dict lazily to CPU state_dict = torch.load(file_name, map_location='cpu') print(state_dict.keys()) + # Convert PyTorch tensors to NumPy arrays numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} @@ -128,6 +130,7 @@ def move_to_cpu(tree): def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6): """Compares two JAX PyTrees of weights and logs where they differ, safely handling PMAP replication.""" # Attempt to unreplicate if needed + params1 = maybe_unreplicate(params1) params2 = 
maybe_unreplicate(params2) diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index 4d226a2f6..10200b7f6 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -171,12 +171,14 @@ def update_params(workload: spec.Workload, }, global_step) # Log the number of parameters. - if global_step % 100 == 0 and workload.metrics_logger is not None: - date_ = "2025-06-14" - file_name = f"/results/schedule_free_test_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth" + if global_step % 100 == 0: + date_ = "2025-07-01" + file_name = f"/results/schedule_free_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth" params = use_pytorch_weights_cpu_copy(new_params, file_name=file_name, replicate=True) are_weights_equal(new_params, params) del params + + breakpoint() return (new_optimizer_state, opt_update_fn), new_params, new_model_state diff --git a/submission_runner.py b/submission_runner.py index 468a04c7c..5945875dc 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -372,6 +372,19 @@ def train_once( rng=update_rng, **({'train_state': MappingProxyType(train_state)} if needs_train_state else {})) + if FLAGS.framework=="pytorch" and global_step % 100 == 0: + if global_step > 1000: + import torch.distributed as dist + import sys + dist.destroy_process_group() + sys.exit(0) + # Save the PyTorch weights to a file every 100 steps. + date_ = datetime.date.today().strftime('%Y-%m-%d') + file_name = os.path.join( + log_dir, f'/results/schedule_free_pytorch_weights/{workload_name}_{date_}_after_{global_step}_steps.pth') + logging.info(f'Saving PyTorch weights to {file_name}') + torch.save(model_params.module.state_dict(), file_name) + except spec.TrainingCompleteError: train_state['training_complete'] = True global_step += 1 @@ -383,131 +396,131 @@ def train_once( train_state['accumulated_submission_time'] += ( train_step_end_time - train_state['last_step_end_time']) - # Check if submission is eligible for an untimed eval. - if ((train_step_end_time - train_state['last_eval_time']) >= - workload.eval_period_time_sec or train_state['training_complete']): - - # Prepare for evaluation (timed). - if prepare_for_eval is not None: - - with profiler.profile('Prepare for eval'): - del batch - prepare_for_eval_start_time = get_time() - optimizer_state, model_params, model_state = prepare_for_eval( - workload=workload, - current_param_container=model_params, - current_params_types=workload.model_params_types, - model_state=model_state, - hyperparameters=hyperparameters, - loss_type=workload.loss_type, - optimizer_state=optimizer_state, - eval_results=eval_results, - global_step=global_step, - rng=prep_eval_rng) - prepare_for_eval_end_time = get_time() - - # Update sumbission time. - train_state['accumulated_submission_time'] += ( - prepare_for_eval_end_time - prepare_for_eval_start_time) - - # Check if time is remaining, - # use 1.5x the runtime budget for the self-tuning ruleset. - max_allowed_runtime_sec = ( - workload.max_allowed_runtime_sec if FLAGS.tuning_ruleset == 'external' - else 1.5 * workload.max_allowed_runtime_sec) - train_state['is_time_remaining'] = ( - train_state['accumulated_submission_time'] < max_allowed_runtime_sec) - - # Eval if time is remaining (untimed). 
- if train_state['is_time_remaining']: - - with profiler.profile('Evaluation'): - _reset_cuda_mem() - - try: - eval_start_time = get_time() - latest_eval_result = workload.eval_model(global_eval_batch_size, - model_params, - model_state, - eval_rng, - data_dir, - imagenet_v2_data_dir, - global_step) - # Check if targets reached. - # Note that this is one of the stopping conditions for the length of - # a training run. To score the run we only consider the time - # to validation target retrospectively. - train_state['validation_goal_reached'] = ( - workload.has_reached_validation_target(latest_eval_result) or - train_state['validation_goal_reached']) - train_state['test_goal_reached'] = ( - workload.has_reached_test_target(latest_eval_result) or - train_state['test_goal_reached']) - goals_reached = ( - train_state['validation_goal_reached'] and - train_state['test_goal_reached']) - # Save last eval time. - eval_end_time = get_time() - train_state['last_eval_time'] = eval_end_time - - # Accumulate eval time. - train_state[ - 'accumulated_eval_time'] += eval_end_time - eval_start_time - - # Add times to eval results for logging. - latest_eval_result['score'] = ( - train_state['accumulated_submission_time']) - latest_eval_result[ - 'total_duration'] = eval_end_time - global_start_time - latest_eval_result['accumulated_submission_time'] = train_state[ - 'accumulated_submission_time'] - latest_eval_result['accumulated_eval_time'] = train_state[ - 'accumulated_eval_time'] - latest_eval_result['accumulated_logging_time'] = train_state[ - 'accumulated_logging_time'] - time_since_start = latest_eval_result['total_duration'] - logging.info(f'Time since start: {time_since_start:.2f}s, ' - f'\tStep: {global_step}, \t{latest_eval_result}') - eval_results.append((global_step, latest_eval_result)) - - logging_start_time = get_time() - - if log_dir is not None and RANK == 0: - metrics_logger.append_scalar_metrics( - latest_eval_result, - global_step=global_step, - preemption_count=preemption_count, - is_eval=True, - ) - if save_checkpoints: - checkpoint_utils.save_checkpoint( - framework=FLAGS.framework, - optimizer_state=optimizer_state, - model_params=model_params, - model_state=model_state, - train_state=train_state, - eval_results=eval_results, - global_step=global_step, - preemption_count=preemption_count, - checkpoint_dir=log_dir, - save_intermediate_checkpoints=FLAGS - .save_intermediate_checkpoints) - - logging_end_time = get_time() - train_state['accumulated_logging_time'] += ( - logging_end_time - logging_start_time) - - _reset_cuda_mem() - - except RuntimeError as e: - logging.exception(f'Eval step {global_step} error.\n') - if 'out of memory' in str(e): - logging.warning( - 'Error: GPU out of memory during eval during step ' - f'{global_step}, error : {str(e)}.') - _reset_cuda_mem() - - train_state['last_step_end_time'] = get_time() + # # Check if submission is eligible for an untimed eval. + # if ((train_step_end_time - train_state['last_eval_time']) >= + # workload.eval_period_time_sec or train_state['training_complete']): + + # # Prepare for evaluation (timed). 
+ # if prepare_for_eval is not None: + + # with profiler.profile('Prepare for eval'): + # del batch + # prepare_for_eval_start_time = get_time() + # optimizer_state, model_params, model_state = prepare_for_eval( + # workload=workload, + # current_param_container=model_params, + # current_params_types=workload.model_params_types, + # model_state=model_state, + # hyperparameters=hyperparameters, + # loss_type=workload.loss_type, + # optimizer_state=optimizer_state, + # eval_results=eval_results, + # global_step=global_step, + # rng=prep_eval_rng) + # prepare_for_eval_end_time = get_time() + + # # Update sumbission time. + # train_state['accumulated_submission_time'] += ( + # prepare_for_eval_end_time - prepare_for_eval_start_time) + + # # Check if time is remaining, + # # use 1.5x the runtime budget for the self-tuning ruleset. + # max_allowed_runtime_sec = ( + # workload.max_allowed_runtime_sec if FLAGS.tuning_ruleset == 'external' + # else 1.5 * workload.max_allowed_runtime_sec) + # train_state['is_time_remaining'] = ( + # train_state['accumulated_submission_time'] < max_allowed_runtime_sec) + + # # Eval if time is remaining (untimed). + # if train_state['is_time_remaining']: + + # with profiler.profile('Evaluation'): + # _reset_cuda_mem() + + # try: + # eval_start_time = get_time() + # latest_eval_result = workload.eval_model(global_eval_batch_size, + # model_params, + # model_state, + # eval_rng, + # data_dir, + # imagenet_v2_data_dir, + # global_step) + # # Check if targets reached. + # # Note that this is one of the stopping conditions for the length of + # # a training run. To score the run we only consider the time + # # to validation target retrospectively. + # train_state['validation_goal_reached'] = ( + # workload.has_reached_validation_target(latest_eval_result) or + # train_state['validation_goal_reached']) + # train_state['test_goal_reached'] = ( + # workload.has_reached_test_target(latest_eval_result) or + # train_state['test_goal_reached']) + # goals_reached = ( + # train_state['validation_goal_reached'] and + # train_state['test_goal_reached']) + # # Save last eval time. + # eval_end_time = get_time() + # train_state['last_eval_time'] = eval_end_time + + # # Accumulate eval time. + # train_state[ + # 'accumulated_eval_time'] += eval_end_time - eval_start_time + + # # Add times to eval results for logging. 
+ # latest_eval_result['score'] = ( + # train_state['accumulated_submission_time']) + # latest_eval_result[ + # 'total_duration'] = eval_end_time - global_start_time + # latest_eval_result['accumulated_submission_time'] = train_state[ + # 'accumulated_submission_time'] + # latest_eval_result['accumulated_eval_time'] = train_state[ + # 'accumulated_eval_time'] + # latest_eval_result['accumulated_logging_time'] = train_state[ + # 'accumulated_logging_time'] + # time_since_start = latest_eval_result['total_duration'] + # logging.info(f'Time since start: {time_since_start:.2f}s, ' + # f'\tStep: {global_step}, \t{latest_eval_result}') + # eval_results.append((global_step, latest_eval_result)) + + # logging_start_time = get_time() + + # if log_dir is not None and RANK == 0: + # metrics_logger.append_scalar_metrics( + # latest_eval_result, + # global_step=global_step, + # preemption_count=preemption_count, + # is_eval=True, + # ) + # if save_checkpoints: + # checkpoint_utils.save_checkpoint( + # framework=FLAGS.framework, + # optimizer_state=optimizer_state, + # model_params=model_params, + # model_state=model_state, + # train_state=train_state, + # eval_results=eval_results, + # global_step=global_step, + # preemption_count=preemption_count, + # checkpoint_dir=log_dir, + # save_intermediate_checkpoints=FLAGS + # .save_intermediate_checkpoints) + + # logging_end_time = get_time() + # train_state['accumulated_logging_time'] += ( + # logging_end_time - logging_start_time) + + # _reset_cuda_mem() + + # except RuntimeError as e: + # logging.exception(f'Eval step {global_step} error.\n') + # if 'out of memory' in str(e): + # logging.warning( + # 'Error: GPU out of memory during eval during step ' + # f'{global_step}, error : {str(e)}.') + # _reset_cuda_mem() + + # train_state['last_step_end_time'] = get_time() metrics = {'eval_results': eval_results, 'global_step': global_step} From 7a0f8db5d9ce26c92ddc2c926e5c165fe3924235 Mon Sep 17 00:00:00 2001 From: init-22 Date: Wed, 2 Jul 2025 15:22:03 +0000 Subject: [PATCH 7/9] fix: cleaning up the code --- .../criteo1tb/criteo1tb_jax/workload.py | 4 +- custom_pytorch_jax_converter.py | 110 +++--------------- .../schedule_free/jax/submission.py | 6 +- 3 files changed, 17 insertions(+), 103 deletions(-) diff --git a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py index 8d98f09ea..da2a4d4b7 100644 --- a/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py +++ b/algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py @@ -13,7 +13,7 @@ from algoperf.workloads.criteo1tb.criteo1tb_jax import models from algoperf.workloads.criteo1tb.workload import \ BaseCriteo1TbDlrmSmallWorkload -from custom_pytorch_jax_converter import use_pytorch_weights_inplace +from custom_pytorch_jax_converter import use_pytorch_weights @@ -105,7 +105,7 @@ def init_model_fn( {'params': params_rng, 'dropout': dropout_rng}, jnp.ones(input_shape, jnp.float32)) initial_params = initial_variables['params'] - initial_params = use_pytorch_weights_inplace(initial_params, file_name="/results/pytorch_base_model_criteo1tb_1_july.pth") + initial_params = use_pytorch_weights(file_name="/results/pytorch_base_model_criteo1tb_1_july.pth") self._param_shapes = param_utils.jax_param_shapes(initial_params) self._param_types = param_utils.jax_param_types(self._param_shapes) return jax_utils.replicate(initial_params), None diff --git a/custom_pytorch_jax_converter.py b/custom_pytorch_jax_converter.py index 7800189dd..fbef09ac0 100644 --- 
a/custom_pytorch_jax_converter.py +++ b/custom_pytorch_jax_converter.py @@ -21,100 +21,40 @@ The function assumes that the Jax model parameters are already initialized and that the PyTorch weights are in the correct format. """ -def use_pytorch_weights_inplace(jax_params, file_name=None, replicate=False): - # Load PyTorch state_dict - state_dict = torch.load(file_name) - print(state_dict.keys()) - - # Convert PyTorch tensors to NumPy arrays - numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} - - # --- Embedding Table --- - embedding_table = np.concatenate([ - numpy_weights[f'embedding_chunk_{i}'] for i in range(4) - ], axis=0) # adjust axis depending on chunking direction - - jax_params['embedding_table'] = jnp.array(embedding_table) - - # --- Bot MLP: Dense_0 to Dense_2 --- - for i, j in zip([0, 2, 4], range(3)): - jax_params[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'bot_mlp.{i}.weight'].T) - jax_params[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'bot_mlp.{i}.bias']) - - # --- Top MLP: Dense_3 to Dense_7 --- - for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): - jax_params[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) - jax_params[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) - - del state_dict - return jax_params +def use_pytorch_weights(file_name: str): + jax_copy = {} - -def use_pytorch_weights_cpu_copy(jax_params, file_name=None, replicate=False): - - def deep_copy_to_cpu(pytree): - return tree_map(lambda x: jax.device_put(jnp.array(copy.deepcopy(x)), device=jax.devices("cpu")[0]), pytree) - - jax_copy = deep_copy_to_cpu(jax_params) # Load PyTorch state_dict lazily to CPU state_dict = torch.load(file_name, map_location='cpu') print(state_dict.keys()) - + # Convert PyTorch tensors to NumPy arrays - numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} + numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} # --- Embedding Table --- embedding_table = np.concatenate([ numpy_weights[f'embedding_chunk_{i}'] for i in range(4) - ], axis=0) # adjust axis depending on chunking direction + ], axis=0) # adjust axis if chunking is not vertical jax_copy['embedding_table'] = jnp.array(embedding_table) # --- Bot MLP: Dense_0 to Dense_2 --- for i, j in zip([0, 2, 4], range(3)): + jax_copy[f'Dense_{j}'] = {} jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'bot_mlp.{i}.weight'].T) jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'bot_mlp.{i}.bias']) # --- Top MLP: Dense_3 to Dense_7 --- for i, j in zip([0, 2, 4, 6, 8], range(3, 8)): + jax_copy[f'Dense_{j}'] = {} jax_copy[f'Dense_{j}']['kernel'] = jnp.array(numpy_weights[f'top_mlp.{i}.weight'].T) jax_copy[f'Dense_{j}']['bias'] = jnp.array(numpy_weights[f'top_mlp.{i}.bias']) - #jax_copy = tree_map(lambda x: jnp.array(x), jax_copy) - del state_dict + del state_dict return jax_copy -def use_pytorch_weights_inplace_mnist(jax_params, file_name=None, replicate=False): - # Load the PyTorch checkpoint - ckpt = torch.load(file_name) - state_dict = ckpt['state_dict'] if 'state_dict' in ckpt else ckpt - - print("Loaded PyTorch keys:", state_dict.keys()) - - # Convert to numpy - numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()} - - # Mapping PyTorch keys → JAX Dense layers - layer_map = { - 'net.layer1': 'Dense_0', - 'net.layer2': 'Dense_1', - } - - for pt_name, jax_name in layer_map.items(): - weight_key = f"{pt_name}.weight" - bias_key = f"{pt_name}.bias" - - if weight_key not in numpy_weights or bias_key not in 
numpy_weights: - raise KeyError(f"Missing keys: {weight_key} or {bias_key} in PyTorch weights") - - jax_params[jax_name]['kernel'] = jnp.array(numpy_weights[weight_key].T) # Transpose! - jax_params[jax_name]['bias'] = jnp.array(numpy_weights[bias_key]) - - return jax_params - - def maybe_unreplicate(pytree): """If leading axis matches device count, strip it assuming it's pmap replication.""" num_devices = jax.device_count() @@ -123,6 +63,7 @@ def maybe_unreplicate(pytree): pytree ) + def move_to_cpu(tree): return jax.tree_util.tree_map(lambda x: jax.device_put(x, device=jax.devices("cpu")[0]), tree) @@ -143,7 +84,7 @@ def compare_fn(p1, p2): nonlocal all_equal if not jnp.allclose(p1, p2, atol=atol, rtol=rtol): logging.info("❌ Mismatch found:") - logging.info(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}") + logging.info(f"Shape : {p1.shape}, Shape 2: {p2.shape}") logging.info(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}") all_equal = False return jnp.allclose(p1, p2, atol=atol, rtol=rtol) @@ -156,31 +97,6 @@ def compare_fn(p1, p2): if all_equal: logging.info("✅ All weights are equal (within tolerance)") - return all_equal - - - -# def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6): -# """Compares two JAX PyTrees of weights and prints where they differ.""" -# all_equal = True - -# def compare_fn(p1, p2): -# nonlocal all_equal -# #if not jnp.allclose(p1, p2): -# if not jnp.allclose(p1, p2, atol=atol, rtol=rtol): -# logging.info("❌ Mismatch found:") -# logging.info(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}") -# logging.info(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}") -# all_equal = False -# return jnp.allclose(p1, p2, atol=atol, rtol=rtol) - -# try: -# _ = jax.tree_util.tree_map(compare_fn, params1, params2) -# except Exception as e: -# logging.info("❌ Structure mismatch or error during comparison:", e) -# return False - -# if all_equal: -# logging.info("✅ All weights are equal (within tolerance)") -# return all_equal - + del params1 + del params2 + return all_equal \ No newline at end of file diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index 10200b7f6..7eb48c244 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -10,7 +10,7 @@ import jax.numpy as jnp from optax.contrib import schedule_free_adamw from algoperf import spec -from custom_pytorch_jax_converter import use_pytorch_weights_cpu_copy, are_weights_equal +from custom_pytorch_jax_converter import use_pytorch_weights, are_weights_equal _GRAD_CLIP_EPS = 1e-6 @@ -174,11 +174,9 @@ def update_params(workload: spec.Workload, if global_step % 100 == 0: date_ = "2025-07-01" file_name = f"/results/schedule_free_pytorch_weights/criteo1tb_{date_}_after_{global_step}_steps.pth" - params = use_pytorch_weights_cpu_copy(new_params, file_name=file_name, replicate=True) + params = use_pytorch_weights(file_name=file_name) are_weights_equal(new_params, params) del params - - breakpoint() return (new_optimizer_state, opt_update_fn), new_params, new_model_state From 8bbe5e2d96017a60cdc3eedf6e281e97c99ae8b5 Mon Sep 17 00:00:00 2001 From: init-22 Date: Wed, 2 Jul 2025 15:26:20 +0000 Subject: [PATCH 8/9] better docstring for the use_pytorch_weights function --- custom_pytorch_jax_converter.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/custom_pytorch_jax_converter.py b/custom_pytorch_jax_converter.py index fbef09ac0..e5c723919 100644 --- 
a/custom_pytorch_jax_converter.py +++ b/custom_pytorch_jax_converter.py @@ -7,22 +7,19 @@ import copy from jax.tree_util import tree_map -""" -Jax default parameter structure: -dict_keys(['Dense_0', 'Dense_1', 'Dense_2', 'Dense_3', 'Dense_4', 'Dense_5', 'Dense_6', 'Dense_7', 'embedding_table']) - -Pytorch stateduct structure: -dict_keys(['embedding_chunk_0', 'embedding_chunk_1', 'embedding_chunk_2', 'embedding_chunk_3', 'bot_mlp.0.weight', 'bot_mlp.0.bias', 'bot_mlp.2.weight', 'bot_mlp.2.bias', 'bot_mlp.4.weight', 'bot_mlp.4.bias', 'top_mlp.0.weight', 'top_mlp.0.bias', 'top_mlp.2.weight', 'top_mlp.2.bias', 'top_mlp.4.weight', 'top_mlp.4.bias', 'top_mlp.6.weight', 'top_mlp.6.bias', 'top_mlp.8.weight', 'top_mlp.8.bias']) +def use_pytorch_weights(file_name: str): + """ + Jax default parameter structure: + dict_keys(['Dense_0', 'Dense_1', 'Dense_2', 'Dense_3', 'Dense_4', 'Dense_5', 'Dense_6', 'Dense_7', 'embedding_table']) + Pytorch stateduct structure: + dict_keys(['embedding_chunk_0', 'embedding_chunk_1', 'embedding_chunk_2', 'embedding_chunk_3', 'bot_mlp.0.weight', 'bot_mlp.0.bias', 'bot_mlp.2.weight', 'bot_mlp.2.bias', 'bot_mlp.4.weight', 'bot_mlp.4.bias', 'top_mlp.0.weight', 'top_mlp.0.bias', 'top_mlp.2.weight', 'top_mlp.2.bias', 'top_mlp.4.weight', 'top_mlp.4.bias', 'top_mlp.6.weight', 'top_mlp.6.bias', 'top_mlp.8.weight', 'top_mlp.8.bias']) -The following function converts the PyTorch weights to the Jax format -and assigns them to the Jax model parameters. -The function assumes that the Jax model parameters are already initialized -and that the PyTorch weights are in the correct format. -""" -def use_pytorch_weights(file_name: str): + The following function converts the PyTorch weights to the Jax format + """ + jax_copy = {} # Load PyTorch state_dict lazily to CPU From da67b52231cf139fb75ba680b380605927e24de4 Mon Sep 17 00:00:00 2001 From: init-22 Date: Sun, 6 Jul 2025 07:41:39 +0000 Subject: [PATCH 9/9] removing unnecessary code --- reference_algorithms/schedule_free/jax/submission.py | 6 ------ reference_algorithms/schedule_free/pytorch/submission.py | 8 -------- 2 files changed, 14 deletions(-) diff --git a/reference_algorithms/schedule_free/jax/submission.py b/reference_algorithms/schedule_free/jax/submission.py index 7eb48c244..62a1c2a82 100644 --- a/reference_algorithms/schedule_free/jax/submission.py +++ b/reference_algorithms/schedule_free/jax/submission.py @@ -36,12 +36,6 @@ def init_optimizer_state(workload: spec.Workload, model_params del model_state del rng - lr=HPARAMS['learning_rate'] - betas=(1.0 - HPARAMS['one_minus_beta1'], HPARAMS['beta2']) - warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75) - weight_decay=HPARAMS['weight_decay'] - weight_lr_power=HPARAMS['weight_lr_power'] - r=HPARAMS['r'] opt_init_fn, opt_update_fn = schedule_free_adamw( learning_rate=HPARAMS['learning_rate'], diff --git a/reference_algorithms/schedule_free/pytorch/submission.py b/reference_algorithms/schedule_free/pytorch/submission.py index ef3135fd8..450745737 100644 --- a/reference_algorithms/schedule_free/pytorch/submission.py +++ b/reference_algorithms/schedule_free/pytorch/submission.py @@ -266,14 +266,6 @@ def closure(): loss = optimizer_state['optimizer'].step(closure) - # # current_state = current_model.module.state_dict() is a workaround for DDP - # if global_step==1: - # torch.save(current_model.module.state_dict(), "/results/pytorch_base_model_criteo1tb_11may_global_step2.pth") - # import torch.distributed as dist - # import sys - # dist.destroy_process_group() - # 
sys.exit(0) - return (optimizer_state, current_param_container, new_model_state)
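A note on the layout convention behind use_pytorch_weights in PATCH 7/8: torch.nn.Linear stores its weight as (out_features, in_features), whereas a Flax Dense kernel is (in_features, out_features), which is why every bot_mlp / top_mlp weight is transposed before being placed into the JAX parameter tree. A minimal, self-contained sketch with toy sizes (not the Criteo DLRM):

import numpy as np
import torch
import jax.numpy as jnp

torch_linear = torch.nn.Linear(4, 3)
x = np.random.randn(2, 4).astype(np.float32)

# PyTorch forward pass: y = x @ W.T + b, with W of shape (out_features, in_features).
torch_out = torch_linear(torch.from_numpy(x)).detach().numpy()

# Flax Dense convention: y = x @ kernel + bias, with kernel of shape (in, out),
# so the PyTorch weight is transposed exactly once during conversion.
kernel = jnp.array(torch_linear.weight.detach().numpy().T)  # (4, 3)
bias = jnp.array(torch_linear.bias.detach().numpy())        # (3,)
jax_out = np.asarray(jnp.array(x) @ kernel + bias)

np.testing.assert_allclose(torch_out, jax_out, rtol=1e-5, atol=1e-5)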
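The JAX submission's periodic check (every 100 steps it loads the matching PyTorch checkpoint and calls are_weights_equal) reduces to a leaf-wise allclose over two pytrees. A stripped-down version of that pattern is below; trees_allclose is a hypothetical helper, and the real are_weights_equal additionally logs the shapes and the maximum absolute difference on a mismatch.

import jax
import jax.numpy as jnp

def trees_allclose(tree_a, tree_b, atol=1e-6, rtol=1e-6):
  # Compare the two parameter trees leaf by leaf; a structure mismatch raises.
  leaf_results = jax.tree_util.tree_map(
      lambda a, b: bool(jnp.allclose(a, b, atol=atol, rtol=rtol)), tree_a, tree_b)
  return all(jax.tree_util.tree_leaves(leaf_results))

params_a = {'Dense_0': {'kernel': jnp.ones((4, 3)), 'bias': jnp.zeros(3)}}
params_b = {'Dense_0': {'kernel': jnp.ones((4, 3)), 'bias': jnp.zeros(3)}}
assert trees_allclose(params_a, params_b)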
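On the debug block deleted in PATCH 9: it saved current_model.module.state_dict() rather than current_model.state_dict() because, under DistributedDataParallel, the wrapper prefixes every key with "module.", and the prefixed keys would no longer line up with the names use_pytorch_weights expects (bot_mlp.0.weight, top_mlp.0.weight, ...). A small illustration; the DDP lines are left commented out because they require an initialized process group.

import torch

model = torch.nn.Linear(4, 3)
print(list(model.state_dict().keys()))   # ['weight', 'bias']

# With an initialized process group:
#   ddp = torch.nn.parallel.DistributedDataParallel(model)
#   list(ddp.state_dict().keys())        # ['module.weight', 'module.bias']
#   torch.save(ddp.module.state_dict(), 'checkpoint.pth')  # unprefixed keys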
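Finally, the commented-out eval block earlier in the series recovers from CUDA out-of-memory errors during evaluation instead of aborting the run. A minimal sketch of that pattern follows; run_eval is a stand-in for the real eval call, and the body of _reset_cuda_mem is an assumption since the original helper is not shown in this series.

import gc
import torch

def _reset_cuda_mem():
  # Assumed implementation: drop Python references and return cached blocks to CUDA.
  gc.collect()
  torch.cuda.empty_cache()

def safe_eval(run_eval, global_step):
  try:
    return run_eval()
  except RuntimeError as e:
    if 'out of memory' in str(e):
      # Mirror the original handling: report, free GPU memory, and skip this eval.
      print(f'GPU out of memory during eval at step {global_step}: {e}')
      _reset_cuda_mem()
      return None
    raise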