Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 64 additions & 179 deletions cpp/csp/cppnodes/statsimpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,9 @@
#include <numeric>
#include <set>
#include <type_traits>

#ifdef __linux__
#include <ext/pb_ds/assoc_container.hpp>
#include <ext/pb_ds/tree_policy.hpp>
#endif
#include <boost/multi_index_container.hpp>
#include <boost/multi_index/ordered_index.hpp>
#include <boost/multi_index/ranked_index.hpp>

namespace csp::cppnodes
{
Expand Down Expand Up @@ -1087,19 +1085,8 @@ class WeightedKurtosis
bool m_excess;
};

#ifdef __linux__
template<typename Comparator>
using ost = __gnu_pbds::tree<double, __gnu_pbds::null_type, Comparator, __gnu_pbds::rb_tree_tag,
__gnu_pbds::tree_order_statistics_node_update>;

template<typename Comparator>
void ost_erase( ost<Comparator> &t, double & v )
{
int rank = t.order_of_key( v );
auto it = t.find_by_order( rank );
t.erase( it );
}
#endif
template <typename Comparator>
using ost = boost::multi_index::multi_index_container<double, boost::multi_index::indexed_by<boost::multi_index::ranked_non_unique<boost::multi_index::identity<double>, Comparator>>>;

class Quantile
{
Expand Down Expand Up @@ -1142,11 +1129,7 @@ class Quantile

void remove( double x )
{
#ifdef __linux__
ost_erase( m_tree, x );
#else
m_tree.erase( m_tree.find( x ) );
#endif
}

void reset()
Expand All @@ -1165,113 +1148,60 @@ class Quantile
double target = std::get<double>( m_quants[index]._data ) * ( m_tree.size() - 1 );
int ft = floor( target );
int ct = ceil( target );
auto fIt = m_tree.get<0>().nth( ft );
auto cIt = ( ft == ct ) ? fIt : std::next( fIt );

double qtl = 0.0;
#ifdef __linux__
switch ( m_interpolation )
{
case LINEAR:
if( ft == target )
{
qtl = *m_tree.find_by_order( ft );
}
else
{
double lower = *m_tree.find_by_order( ft );
double higher = *m_tree.find_by_order( ct );
qtl = ( 1 - target + ft ) * lower + ( 1 - ct + target ) * higher;
}
break;
case LOWER:
qtl = *m_tree.find_by_order( ft );
break;
case HIGHER:
qtl = *m_tree.find_by_order( ct );
break;
case MIDPOINT:
if( ft == target )
{
qtl = *m_tree.find_by_order( ft );
}
else
{
double lower = *m_tree.find_by_order( ft );
double higher = *m_tree.find_by_order( ct );
qtl = ( higher+lower ) / 2;
}
break;
case NEAREST:
if( target - ft < ct - target )
{
qtl = *m_tree.find_by_order( ft );
}
else
{
qtl = *m_tree.find_by_order( ct );
}
break;
default:
break;
}
#else
auto it = m_tree.begin();
std::advance( it, ft );
switch ( m_interpolation )
{
case LINEAR:
if( ft == target )
{
qtl = *it;
}
else
{
double lower = *it;
double higher = *++it;
qtl = ( 1 - target + ft ) * lower + ( 1 - ct + target ) * higher;
}
break;
case LOWER:
qtl = *it;
break;
case HIGHER:
qtl = ( ft == ct ? *it : *++it );
break;
case MIDPOINT:
if( ft == target )
{
qtl = *it;
}
else
{
double lower = *it;
double higher = *++it;
qtl = ( higher+lower ) / 2;
}
break;
case NEAREST:
if( target - ft <= ct - target )
{
qtl = *it;
}
else
{
qtl = *++it;
}
break;
default:
break;
case LINEAR:
if ( ft == target )
{
qtl = *fIt;
}
else
{
double lower = *fIt;
double higher = *cIt;
qtl = ( 1 - target + ft ) * lower + ( 1 - ct + target ) * higher;
}
break;
case LOWER:
qtl = *fIt;
break;
case HIGHER:
qtl = *cIt;
break;
case MIDPOINT:
if ( ft == target )
{
qtl = *fIt;
}
else
{
double lower = *fIt;
double higher = *cIt;
qtl = ( higher + lower ) / 2;
}
break;
case NEAREST:
if ( target - ft < ct - target )
{
qtl = *fIt;
}
else
{
qtl = *cIt;
}
break;
default:
break;
}
#endif
return qtl;
}

private:

#ifdef __linux__
ost<std::less_equal<double>> m_tree;
#else
std::multiset<double> m_tree;
#endif
ost<std::less<double>> m_tree;
std::vector<Dictionary::Data> m_quants;
int64_t m_interpolation;
};
Expand Down Expand Up @@ -1359,119 +1289,74 @@ class Rank
else
{
m_lastval = x;
#ifdef __linux__
if( m_method == MAX )
m_maxtree.insert( x );
else
m_mintree.insert( x );
#else
m_tree.insert( x );
#endif
}
}

void remove( double x )
{
if( likely( !isnan( x ) ) )
{
#ifdef __linux__
if( m_method == MAX )
ost_erase( m_maxtree, x );
if ( m_method == MAX )
m_maxtree.erase ( m_maxtree.find( x ) );
else
ost_erase( m_mintree, x );
#else
m_tree.erase( m_tree.find( x ) );
#endif
m_mintree.erase ( m_mintree.find( x ) );
}
}

void reset()
{
#ifdef __linux__
if( m_method == MAX )
m_maxtree.clear();
else
m_mintree.clear();
#else
m_tree.clear();
#endif
}

double compute() const
{
// Verify tree is not empty and lastValue is valid
// Last value can only ever be NaN if the "keep" nan option is used
#ifdef __linux__
if( likely( !isnan( m_lastval ) && ( ( m_method == MAX && m_maxtree.size() > 0 ) || m_mintree.size() > 0 ) ) )
{
switch( m_method )
{
case MIN:
{
if( m_mintree.size() == 1 )
if ( m_mintree.size() == 1 )
return 0;
return m_mintree.order_of_key( m_lastval );
return m_mintree.get<0>().find_rank( m_lastval );
}
case MAX:
{
if( m_maxtree.size() == 1 )
if ( m_maxtree.size() == 1 )
return 0;
return m_maxtree.size() - 1 - m_maxtree.order_of_key( m_lastval );
return m_maxtree.size() - 1 - m_maxtree.get<0>().find_rank( m_lastval );
}
case AVG:
{
// Need to iterate to find average rank
if( m_mintree.size() == 1 )
if ( m_mintree.size() == 1 )
return 0;

int min_rank = m_mintree.order_of_key( m_lastval );
int min_rank = m_mintree.get<0>().find_rank( m_lastval );
int max_rank = min_rank;
auto it = m_mintree.find_by_order( min_rank );
auto it = m_mintree.get<0>().nth( min_rank );
it++;
for( ; it != m_mintree.end() && *it == m_lastval ; it++ ) max_rank++;
for( ; it != m_mintree.end() && *it == m_lastval ; it++ ) max_rank++; // While this is in theory O(n), in reality this loop is only interated once, since there are likely no duplicate values or very few.
return ( double )( min_rank + max_rank ) / 2;
}

default:
break;
}
}
#else
if( likely( !isnan( m_lastval ) && m_tree.size() > 0 ) )
{
switch( m_method )
{
case MIN:
{
return std::distance( m_tree.begin(), m_tree.find( m_lastval ) );
}
case MAX:
{
auto end_range = m_tree.equal_range( m_lastval ).second;
return std::distance( m_tree.begin(), std::prev( end_range ) );
}
case AVG:
{
auto range = m_tree.equal_range( m_lastval );
return std::distance( m_tree.begin(), range.first ) + ( double )std::distance( range.first, std::prev( range.second ) ) / 2;
}
default:
break;
}
}
#endif

return std::numeric_limits<double>::quiet_NaN();
}

private:

#ifdef __linux__
ost<std::less_equal<double>> m_mintree;
ost<std::greater_equal<double>> m_maxtree;
#else
std::multiset<double> m_tree;
#endif
ost<std::less<double>> m_mintree;
ost<std::greater<double>> m_maxtree;
double m_lastval;

int64_t m_method;
Expand Down
48 changes: 48 additions & 0 deletions csp/benchmarks/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Written by @AdamGlustein
import numpy as np
import time
from datetime import datetime, timedelta

import csp

st = datetime(2020, 1, 1)
N = 10_000
ARRAY_SIZE = 100
TEST_TIMES = [st + timedelta(seconds=i) for i in range(N)]
RANDOM_VALUES = [np.random.normal(size=(ARRAY_SIZE,)) for i in range(N)] # 100 element np array
DATA = list(zip(TEST_TIMES, RANDOM_VALUES))
INTERVAL = 1000
NUM_SAMPLES = 100


def g_qtl():
data = csp.curve(typ=np.ndarray, data=DATA)
median = csp.stats.median(data, interval=INTERVAL)
csp.add_graph_output("final_median", median, tick_count=1)


def g_rank():
data = csp.curve(typ=np.ndarray, data=DATA)
rank = csp.stats.rank(data, interval=INTERVAL)
csp.add_graph_output("final_rank", rank, tick_count=1)


if __name__ == "__main__":
qtl_times, rank_times = list(), list()
for i in range(NUM_SAMPLES):
start = time.time()
csp.run(g_qtl, starttime=st, endtime=timedelta(seconds=N))
post_qtl = time.time()
csp.run(g_rank, starttime=st, endtime=timedelta(seconds=N))
post_rank = time.time()

qtl_times.append(post_qtl - start)
rank_times.append(post_rank - post_qtl)
print(i)

avg_med = sum(qtl_times) / NUM_SAMPLES
avg_rank = sum(rank_times) / NUM_SAMPLES
print(
f"Average time in {NUM_SAMPLES} tests for median with {N=}, {ARRAY_SIZE=}, {INTERVAL=}: {round(avg_med, 2)} s"
)
print(f"Average time in {NUM_SAMPLES} tests for rank with {N=}, {ARRAY_SIZE=}, {INTERVAL=}: {round(avg_rank, 2)} s")
1 change: 1 addition & 0 deletions vcpkg.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"abseil",
"arrow",
"boost-beast",
"boost-multi-index",
"brotli",
"exprtk",
"gtest",
Expand Down