From e512ecfcdd6d399f10d2fca19d9b0ab6ae475904 Mon Sep 17 00:00:00 2001
From: gagan-aryan <gaganaryan19@gmail.com>
Date: Thu, 24 Jun 2021 20:24:52 +0530
Subject: [PATCH] manual benchmark to crunch datatype

---
 manual_benchmarks/test_datatypes_crunch.ipynb | 283 ++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 manual_benchmarks/test_datatypes_crunch.ipynb

diff --git a/manual_benchmarks/test_datatypes_crunch.ipynb b/manual_benchmarks/test_datatypes_crunch.ipynb
new file mode 100644
index 0000000..e506679
--- /dev/null
+++ b/manual_benchmarks/test_datatypes_crunch.ipynb
@@ -0,0 +1,283 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "test_datatypes_crunch.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "CqyExbrNYKcv"
+      },
+      "source": [
+        "### Import stuff"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "wF4Ga_unYJ8F"
+      },
+      "source": [
+        "!pip install radis\n",
+        "\n",
+        "from radis.db.classes import get_molecule_identifier\n",
+        "from radis.levels.partfunc import PartFuncHAPI\n",
+        "\n",
+        "from radis.io.hitemp import fetch_hitemp\n",
+        "from radis.db.classes import get_molecule\n",
+        "from radis.phys.constants import hc_k\n",
+        "\n",
+        "import numpy as np\n",
+        "from numpy import exp, pi"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "jhglpoCFXrpp"
+      },
+      "source": [
+        "### Function to reduce the memory usage of a pandas DataFrame"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "CryqdLGaXdFO"
+      },
+      "source": [
+        "def reduce_mem_usage(props):\n",
+        "    \"\"\"Downcast the non-object columns of ``props`` in place to smaller dtypes.\n",
+        "\n",
+        "    Whole-number columns get the narrowest (un)signed integer type that holds\n",
+        "    them; remaining float columns become float32 (about 7 significant digits).\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    props : pandas.DataFrame\n",
+        "        Frame to shrink; it is modified in place.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    props : pandas.DataFrame\n",
+        "        The object that was passed in, with converted columns.\n",
+        "    NAlist : list\n",
+        "        Names of the columns whose missing values were replaced by -1.\n",
+        "    \"\"\"\n",
+        "    start_mem_usg = props.memory_usage().sum() / 1024**2\n",
+        "    print(\"Memory usage of properties dataframe is :\",start_mem_usg,\" MB\")\n",
+        "    NAlist = []  # Keeps track of columns that have missing values filled in.\n",
+        "    for col in props.columns:\n",
+        "        if props[col].dtype == object:  # Exclude strings\n",
+        "            continue\n",
+        "        # Print current column type\n",
+        "        print(\"******************************\")\n",
+        "        print(\"Column: \",col)\n",
+        "        print(\"dtype before: \",props[col].dtype)\n",
+        "        # Integer dtypes cannot store NaN, so missing values become -1. Work on a\n",
+        "        # local Series and assign it back rather than a chained inplace fillna,\n",
+        "        # which is not guaranteed to write through to the frame.\n",
+        "        values = props[col]\n",
+        "        if values.isna().any():\n",
+        "            NAlist.append(col)\n",
+        "            values = values.fillna(-1)\n",
+        "        # Integer-valued means EVERY entry is finite with no fractional part.\n",
+        "        # Summing (value - int(value)) is not a valid test: columns holding\n",
+        "        # values far below 1 sum to ~0 and +0.5/-0.5 cancel, so such columns\n",
+        "        # used to be cast to integers, destroying their data.\n",
+        "        IsInt = (not values.empty) and bool(\n",
+        "            np.isfinite(values).all() and (values == np.floor(values)).all()\n",
+        "        )\n",
+        "        # Default: floats go to 32 bit, anything else keeps its dtype.\n",
+        "        target = np.float32 if values.dtype.kind == \"f\" else values.dtype\n",
+        "        if IsInt:\n",
+        "            # Bounds are read after the fill so a -1 sentinel selects a signed\n",
+        "            # type; exact Python ints make the range checks inclusive.\n",
+        "            mn, mx = int(values.min()), int(values.max())\n",
+        "            unsigned = (np.uint8, np.uint16, np.uint32, np.uint64)\n",
+        "            signed = (np.int8, np.int16, np.int32, np.int64)\n",
+        "            fits = [t for t in (unsigned if mn >= 0 else signed)\n",
+        "                    if np.iinfo(t).min <= mn and mx <= np.iinfo(t).max]\n",
+        "            # Whole numbers beyond 64 bits keep their original dtype.\n",
+        "            target = fits[0] if fits else values.dtype\n",
+        "        props[col] = values.astype(target)\n",
+        "        # Print new column type\n",
+        "        print(\"dtype after: \",props[col].dtype)\n",
+        "        print(\"******************************\")\n",
+        "\n",
+        "    # Print final result\n",
+        "    print(\"___MEMORY USAGE AFTER COMPLETION:___\")\n",
+        "    mem_usg = props.memory_usage().sum() / 1024**2\n",
+        "    print(\"Memory usage is: \",mem_usg,\" MB\")\n",
+        "    print(\"This is \",100*mem_usg/start_mem_usg,\"% of the initial size\")\n",
+        "    return props, NAlist"
+      ],
+      "execution_count": 9,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "JIrnVcoSX1wo"
+      },
+      "source": [
+        "### Fetch Databank"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9SCd3A3qXx1h",
+        "outputId": "2feaaa4d-a152-4ac2-ce53-fc74634a5885"
+      },
+      "source": [
+        "df0 = fetch_hitemp(molecule='CH4', databank_name='HITEMP-CH4', isotope='1, 2, 3', load_wavenum_min=2000, load_wavenum_max=4000, local_databases='~/')  # HITEMP CH4 lines, isotopes 1-3, window 2000-4000 (presumably cm-1 - confirm)"
+      ],
+      "execution_count": 12,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Using existing database HITEMP-CH4\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "kfIfc1fSYoDd"
+      },
+      "source": [
+        "### Let's Crunch"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "oAgwp-vwYIH6",
+        "outputId": "22309e27-3fda-4dc4-96e3-b1094f159e4c"
+      },
+      "source": [
+        "df1, NAlist = reduce_mem_usage(df0)\n",
+        "rule = \"_________________\"\n",
+        "warning = \"Warning: the following columns have missing values filled with -1': \"\n",
+        "# Banner layout: rule, blank line, warning text, rule, blank line.\n",
+        "for banner_line in (rule, \"\", warning, rule, \"\"):\n",
+        "    print(banner_line)\n",
+        "print(NAlist)"
+      ],
+      "execution_count": 13,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Memory usage of properties dataframe is : 565.2839660644531  MB\n",
+            "******************************\n",
+            "Column:  id\n",
+            "dtype before:  int64\n",
+            "dtype after:  uint8\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  iso\n",
+            "dtype before:  int64\n",
+            "dtype after:  uint8\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  wav\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  int\n",
+            "dtype before:  float64\n",
+            "dtype after:  uint8\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  A\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  airbrd\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  selbrd\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  El\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  Tdpair\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  Pshft\n",
+            "dtype before:  float64\n",
+            "dtype after:  float32\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  gp\n",
+            "dtype before:  float64\n",
+            "dtype after:  uint16\n",
+            "******************************\n",
+            "******************************\n",
+            "Column:  gpp\n",
+            "dtype before:  float64\n",
+            "dtype after:  uint16\n",
+            "******************************\n",
+            "___MEMORY USAGE AFTER COMPLETION:___\n",
+            "Memory usage is:  349.76945400238037  MB\n",
+            "This is  61.875 % of the initial size\n",
+            "_________________\n",
+            "\n",
+            "Warning: the following columns have missing values filled with -1': \n",
+            "_________________\n",
+            "\n",
+            "[]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "4vUB-2bbcLoT"
+      },
+      "source": [
+        ""
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file