import numpy as np  # the helpers below rely on NumPy only


def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer type able to hold ``[lo, hi]``.

    Unsigned types are tried when ``lo`` is non-negative, signed types
    otherwise.  Bounds are inclusive.  Returns ``None`` if nothing fits.
    """
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def _compact_values(values):
    """Return ``(new_array, filled)`` for one numeric column.

    ``new_array`` is a smaller-dtype copy of ``values``, or ``None`` when no
    safe conversion exists.  ``filled`` is True when NaN entries were replaced
    by the ``-1`` sentinel to make an integer conversion possible.
    """
    if values.size == 0:
        # Nothing to inspect and nothing to save.
        return None, False

    is_float = values.dtype.kind == "f"
    if is_float:
        missing = np.isnan(values)
        present = values[~missing]
        # Integers are only exactly representable in float64 up to 2**53;
        # beyond that the comparison with integer limits is unreliable.
        max_exact = 2.0 ** 53
        # Every value is checked individually: summing the fractional parts
        # would let columns of tiny non-integers (e.g. 1e-25) pass as ints.
        integral = bool(
            present.size
            and np.isfinite(present).all()
            and (np.abs(present) <= max_exact).all()
            and (present == np.trunc(present)).all()
        )
    else:
        missing = None
        present = values
        integral = True

    if integral:
        needs_fill = missing is not None and bool(missing.any())
        lo = int(present.min())
        hi = int(present.max())
        if needs_fill:
            # The sentinel takes part in the range, so a column with missing
            # values always ends up in a signed type and -1 cannot wrap.
            lo = min(lo, -1)
            hi = max(hi, -1)
        int_type = _smallest_int_dtype(lo, hi)
        if (
            int_type is not None
            and np.dtype(int_type).itemsize <= values.dtype.itemsize
        ):
            source = np.where(missing, -1, values) if needs_fill else values
            return source.astype(int_type), needs_fill

    if is_float and values.dtype.itemsize > 4:
        with np.errstate(all="ignore"):
            narrowed = values.astype(np.float32)
        # Reject the downcast if it would turn finite values into inf or
        # non-zero values into 0 (float32 has a much smaller exponent range).
        keeps_finite = np.array_equal(np.isfinite(narrowed), np.isfinite(values))
        keeps_nonzero = np.array_equal(narrowed == 0, values == 0)
        if keeps_finite and keeps_nonzero:
            return narrowed, False

    return None, False


def reduce_mem_usage(props):
    """Shrink the numeric columns of a DataFrame to smaller dtypes, in place.

    Integer-valued columns are stored in the narrowest (un)signed integer
    type that holds their range; other float columns are stored as float32
    when that neither overflows nor flushes values to zero.  Precision beyond
    float32 is lost by design.  Progress is reported with ``print``.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to compact.  It is modified in place.

    Returns
    -------
    props : pandas.DataFrame
        The same object that was passed in.
    NAlist : list
        Names of the columns whose NaN entries were replaced by ``-1`` so
        that the column could be stored as an integer type.

    Notes
    -----
    Only columns backed by NumPy integer or float dtypes are touched;
    strings, booleans, datetimes and pandas extension dtypes are left as is.
    NaN in columns that remain floating point is preserved.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.

    for col in props.columns:
        dtype = props[col].dtype
        if not (isinstance(dtype, np.dtype) and dtype.kind in "iuf"):
            continue

        print("******************************")
        print("Column: ", col)
        print("dtype before: ", dtype)

        compacted, filled = _compact_values(props[col].to_numpy())
        if compacted is not None:
            if filled:
                NAlist.append(col)
            # Assign through the frame itself; a chained in-place call on
            # ``props[col]`` is not guaranteed to write back.
            props[col] = compacted

        print("dtype after: ", props[col].dtype)
        print("******************************")

    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ", mem_usg, " MB")
    # Guard the ratio for frames that report zero bytes.
    percent = 100 * mem_usg / start_mem_usg if start_mem_usg else 100.0
    print("This is ", percent, "% of the initial size")
    return props, NAlist
int64\n", + "dtype after: uint8\n", + "******************************\n", + "******************************\n", + "Column: wav\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: int\n", + "dtype before: float64\n", + "dtype after: uint8\n", + "******************************\n", + "******************************\n", + "Column: A\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: airbrd\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: selbrd\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: El\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: Tdpair\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: Pshft\n", + "dtype before: float64\n", + "dtype after: float32\n", + "******************************\n", + "******************************\n", + "Column: gp\n", + "dtype before: float64\n", + "dtype after: uint16\n", + "******************************\n", + "******************************\n", + "Column: gpp\n", + "dtype before: float64\n", + "dtype after: uint16\n", + "******************************\n", + "___MEMORY USAGE AFTER COMPLETION:___\n", + "Memory usage is: 349.76945400238037 MB\n", + "This is 61.875 % of the initial size\n", + "_________________\n", + "\n", + "Warning: the following columns have missing values filled with -1': \n", + "_________________\n", + "\n", + "[]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4vUB-2bbcLoT" + }, + 
"source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file