From d43cd3e25501f9b7ff6206a563e8e7ea22249d61 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 29 Apr 2016 13:57:29 -0400
Subject: [PATCH 001/150] Add some useful things to .gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 7f877cc..f58e2ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,6 @@ _testmain.go
 *.prof
 clean.sh
 kernel/
+
+*~
+gominer

From 43e6a84f1b3294ca9398b03b743ff5c5a12a7789 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 29 Apr 2016 14:52:15 -0400
Subject: [PATCH 002/150] Switch to decred import paths.

---
 README.md | 2 +-
 device.go | 4 ++--
 miner.go  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 0393da1..e2bbe86 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 You need to have OpenCL installed. To download and build gominer, run:
 
-    go get github.com/Dirbaio/gominer
+    go get github.com/decred/gominer
 
 ## Running
 
diff --git a/device.go b/device.go
index a234013..3cd9659 100644
--- a/device.go
+++ b/device.go
@@ -7,8 +7,8 @@ import (
 	"os"
 	"unsafe"
 
-	"github.com/Dirbaio/gominer/blake256"
-	"github.com/Dirbaio/gominer/cl"
+	"github.com/decred/gominer/blake256"
+	"github.com/decred/gominer/cl"
 )
 
 const (
diff --git a/miner.go b/miner.go
index 9f5b48c..dae43d8 100644
--- a/miner.go
+++ b/miner.go
@@ -5,7 +5,7 @@ import (
 	"sync"
 	"time"
 
-	"github.com/Dirbaio/gominer/cl"
+	"github.com/decred/gominer/cl"
 )
 
 const benchmark = true

From 39550c63777095d18ec0aede4afbcaaacb127ca5 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 29 Apr 2016 14:49:08 -0400
Subject: [PATCH 003/150] Enable tls and improve config.

The config was changed to match dcrctl, since that is closer to what the
miner does.  This removes some totally unneeded options and changes some
others.

gominer now uses its own directory for config rather than reusing the
dcrd one.

Add back the btcsuite/Decred copyright header in config.go (since the
whole file was derived from those projects).

Add sample config file.
---
 README.md           |   4 +-
 config.go           | 136 +++++++++++++++++++++++++++-----------------
 getwork.go          | 133 +++++++++++++++++++++++++++++++++++++------
 sample-gominer.conf |  35 ++++++++++++
 4 files changed, 236 insertions(+), 72 deletions(-)
 create mode 100644 sample-gominer.conf

diff --git a/README.md b/README.md
index e2bbe86..efac1bb 100644
--- a/README.md
+++ b/README.md
@@ -8,12 +8,10 @@ You need to have OpenCL installed. To download and build gominer, run:
 
 ## Running
 
-No TLS support yet, sorry!
-
 Run for benchmark:
 
     gominer -B
 
 Run for real mining:
 
-    gominer -u myusername -P hunter2 -c http://localhost:9109
+    gominer -u myusername -P hunter2
diff --git a/config.go b/config.go
index 3c0c3e7..1bdcf5e 100644
--- a/config.go
+++ b/config.go
@@ -1,7 +1,11 @@
+// Copyright (c) 2013-2015 The btcsuite developers
+// Copyright (c) 2015-2016 The Decred developers
+
 package main
 
 import (
 	"fmt"
+	"net"
 	"os"
 	"path/filepath"
 	"sort"
@@ -14,18 +18,17 @@ import (
 const (
 	defaultConfigFilename = "gominer.conf"
 	defaultLogLevel       = "info"
-	defaultDataDirname    = "data"
 	defaultLogDirname     = "logs"
 	defaultLogFilename    = "gominer.log"
 )
 
 var (
-	homeDir            = dcrutil.AppDataDir("dcrd", false)
-	defaultConfigFile  = filepath.Join(homeDir, defaultConfigFilename)
-	defaultDataDir     = filepath.Join(homeDir, defaultDataDirname)
-	defaultRPCKeyFile  = filepath.Join(homeDir, "rpc.key")
-	defaultRPCCertFile = filepath.Join(homeDir, "rpc.cert")
-	defaultLogDir      = filepath.Join(homeDir, defaultLogDirname)
+	minerHomeDir       = dcrutil.AppDataDir("gominer", false)
+	dcrdHomeDir        = dcrutil.AppDataDir("dcrd", false)
+	defaultConfigFile  = filepath.Join(minerHomeDir, defaultConfigFilename)
+	defaultRPCServer   = "localhost"
+	defaultRPCCertFile = filepath.Join(dcrdHomeDir, "rpc.cert")
+	defaultLogDir      = filepath.Join(minerHomeDir, defaultLogDirname)
 )
 
 type config struct {
@@ -33,7 +36,6 @@ type config struct {
 
 	// Config / log options
 	ConfigFile string `short:"C" long:"configfile" description:"Path to configuration file"`
-	DataDir    string `short:"b" long:"datadir" description:"Directory to store wallets and transactions"`
 	LogDir     string `long:"logdir" description:"Directory to log output."`
 	DebugLevel string `short:"d" long:"debuglevel" description:"Logging level for all subsystems {trace, debug, info, warn, error, critical} -- You may also specify <subsystem>=<level>,<subsystem2>=<level>,... to set the log level for individual subsystems -- Use show to list available subsystems"`
 
@@ -43,14 +45,40 @@ type config struct {
 	MemProfile string `long:"memprofile" description:"Write mem profile to the specified file"`
 
 	// RPC connection options
-	RPCConnect string `short:"c" long:"rpcconnect" description:"Hostname/IP and port of dcrd RPC server to connect to (default localhost:19109, mainnet: localhost:9109, simnet: localhost:19556)"`
-	Username   string `short:"u" long:"username" description:"Username for client and dcrd authorization"`
-	Password   string `short:"P" long:"password" default-mask:"-" description:"Password for client and dcrd authorization"`
-	CAFile     string `long:"cafile" description:"File containing root certificates to authenticate a TLS connections with dcrd"`
-	RPCCert    string `long:"rpccert" description:"File containing the certificate file"`
-	RPCKey     string `long:"rpckey" description:"File containing the certificate key"`
-	DisableTLS bool   `long:"notls" description:"Disable TLS for the RPC client -- NOTE: This is only allowed if the RPC client is connecting to localhost"`
-	Benchmark  bool   `short:"B" long:"benchmark" description:"Run in benchmark mode."`
+	RPCUser     string `short:"u" long:"rpcuser" description:"RPC username"`
+	RPCPassword string `short:"P" long:"rpcpass" default-mask:"-" description:"RPC password"`
+	RPCServer   string `short:"s" long:"rpcserver" description:"RPC server to connect to"`
+	RPCCert     string `short:"c" long:"rpccert" description:"RPC server certificate chain for validation"`
+	NoTLS       bool   `long:"notls" description:"Disable TLS"`
+	Proxy       string `long:"proxy" description:"Connect via SOCKS5 proxy (eg. 127.0.0.1:9050)"`
+	ProxyUser   string `long:"proxyuser" description:"Username for proxy server"`
+	ProxyPass   string `long:"proxypass" default-mask:"-" description:"Password for proxy server"`
+
+	Benchmark bool `short:"B" long:"benchmark" description:"Run in benchmark mode."`
+
+	TestNet       bool `long:"testnet" description:"Connect to testnet"`
+	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
+	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
+}
+
+// normalizeAddress returns addr with the passed default port appended if
+// there is not already a port specified.
+func normalizeAddress(addr string, useTestNet, useSimNet bool) string {
+	_, _, err := net.SplitHostPort(addr)
+	if err != nil {
+		var defaultPort string
+		switch {
+		case useTestNet:
+			defaultPort = "19109"
+		case useSimNet:
+			defaultPort = "19556"
+		default:
+			defaultPort = "9109"
+		}
+
+		return net.JoinHostPort(addr, defaultPort)
+	}
+	return addr
 }
 
 // filesExists reports whether the named file or directory exists.
@@ -147,6 +175,20 @@ func parseAndSetDebugLevels(debugLevel string) error {
 	return nil
 }
 
+// cleanAndExpandPath expands environement variables and leading ~ in the
+// passed path, cleans the result, and returns it.
+func cleanAndExpandPath(path string) string {
+	// Expand initial ~ to OS specific home directory.
+	if strings.HasPrefix(path, "~") {
+		homeDir := filepath.Dir(minerHomeDir)
+		path = strings.Replace(path, "~", homeDir, 1)
+	}
+
+	// NOTE: The os.ExpandEnv doesn't work with Windows-style %VARIABLE%,
+	// but they variables can still be expanded via POSIX-style $VARIABLE.
+	return filepath.Clean(os.ExpandEnv(path))
+}
+
 // loadConfig initializes and parses the config using a config file and command
 // line options.
 //
@@ -164,22 +206,24 @@ func loadConfig() (*config, []string, error) {
 	cfg := config{
 		ConfigFile: defaultConfigFile,
 		DebugLevel: defaultLogLevel,
-		DataDir:    defaultDataDir,
 		LogDir:     defaultLogDir,
-		RPCKey:     defaultRPCKeyFile,
+		RPCServer:  defaultRPCServer,
 		RPCCert:    defaultRPCCertFile,
 	}
 
-	// A config file in the current directory takes precedence.
-	if fileExists(defaultConfigFilename) {
-		cfg.ConfigFile = defaultConfigFile
+	// Create the home directory if it doesn't already exist.
+	funcName := "loadConfig"
+	err := os.MkdirAll(minerHomeDir, 0700)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%v\n", err)
+		os.Exit(-1)
 	}
 
 	// Pre-parse the command line options to see if an alternative config
 	// file or the version flag was specified.
 	preCfg := cfg
 	preParser := flags.NewParser(&preCfg, flags.Default)
-	_, err := preParser.Parse()
+	_, err = preParser.Parse()
 	if err != nil {
 		if e, ok := err.(*flags.Error); !ok || e.Type != flags.ErrHelp {
 			preParser.WriteHelp(os.Stderr)
@@ -188,7 +232,6 @@ func loadConfig() (*config, []string, error) {
 	}
 
 	// Show the version and exit if the version flag was specified.
-	funcName := "loadConfig"
 	appName := filepath.Base(os.Args[0])
 	appName = strings.TrimSuffix(appName, filepath.Ext(appName))
 	usageMessage := fmt.Sprintf("Use %s -h to show usage", appName)
@@ -219,33 +262,18 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
-	// If an alternate data directory was specified, and paths with defaults
-	// relative to the data dir are unchanged, modify each path to be
-	// relative to the new data dir.
-	if cfg.DataDir != defaultDataDir {
-		if cfg.RPCKey == defaultRPCKeyFile {
-			cfg.RPCKey = filepath.Join(cfg.DataDir, "rpc.key")
-		}
-		if cfg.RPCCert == defaultRPCCertFile {
-			cfg.RPCCert = filepath.Join(cfg.DataDir, "rpc.cert")
-		}
+	// Multiple networks can't be selected simultaneously.
+	numNets := 0
+	if cfg.TestNet {
+		numNets++
 	}
-
-	// Create the home directory if it doesn't already exist.
-	err = os.MkdirAll(homeDir, 0700)
-	if err != nil {
-		// Show a nicer error message if it's because a symlink is
-		// linked to a directory that does not exist (probably because
-		// it's not mounted).
-		if e, ok := err.(*os.PathError); ok && os.IsExist(err) {
-			if link, lerr := os.Readlink(e.Path); lerr == nil {
-				str := "is symlink %s -> %s mounted?"
-				err = fmt.Errorf(str, e.Path, link)
-			}
-		}
-
-		str := "%s: Failed to create home directory: %v"
-		err := fmt.Errorf(str, funcName, err)
+	if cfg.SimNet {
+		numNets++
+	}
+	if numNets > 1 {
+		str := "%s: The testnet and simnet params can't be used " +
+			"together -- choose one of the two"
+		err := fmt.Errorf(str, "loadConfig")
 		fmt.Fprintln(os.Stderr, err)
 		return nil, nil, err
 	}
@@ -268,9 +296,13 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
-	if cfg.RPCConnect == "" {
-		cfg.RPCConnect = "http://localhost:9109"
-	}
+	// Handle environment variable expansion in the RPC certificate path.
+	cfg.RPCCert = cleanAndExpandPath(cfg.RPCCert)
+
+	// Add default port to RPC server based on --testnet flag
+	// if needed.
+	cfg.RPCServer = normalizeAddress(cfg.RPCServer, cfg.TestNet,
+		cfg.SimNet)
 
 	// Warn about missing config file only after all other configuration is
 	// done.  This prevents the warning on help messages and invalid
diff --git a/getwork.go b/getwork.go
index dce511b..5d15834 100644
--- a/getwork.go
+++ b/getwork.go
@@ -2,15 +2,66 @@ package main
 
 import (
 	"bytes"
-	"encoding/base64"
+	"crypto/tls"
+	"crypto/x509"
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"net"
 	"net/http"
 	"time"
+
+	"github.com/btcsuite/go-socks/socks"
 )
 
+// newHTTPClient returns a new HTTP client that is configured according to the
+// proxy and TLS settings in the associated connection configuration.
+func newHTTPClient(cfg *config) (*http.Client, error) {
+	// Configure proxy if needed.
+	var dial func(network, addr string) (net.Conn, error)
+	if cfg.Proxy != "" {
+		proxy := &socks.Proxy{
+			Addr:     cfg.Proxy,
+			Username: cfg.ProxyUser,
+			Password: cfg.ProxyPass,
+		}
+		dial = func(network, addr string) (net.Conn, error) {
+			c, err := proxy.Dial(network, addr)
+			if err != nil {
+				return nil, err
+			}
+			return c, nil
+		}
+	}
+
+	// Configure TLS if needed.
+	var tlsConfig *tls.Config
+	if !cfg.NoTLS && cfg.RPCCert != "" {
+		pem, err := ioutil.ReadFile(cfg.RPCCert)
+		if err != nil {
+			return nil, err
+		}
+
+		pool := x509.NewCertPool()
+		pool.AppendCertsFromPEM(pem)
+		tlsConfig = &tls.Config{
+			RootCAs:            pool,
+			InsecureSkipVerify: cfg.TLSSkipVerify,
+		}
+	}
+
+	// Create and return the new HTTP client potentially configured with a
+	// proxy and TLS.
+	client := http.Client{
+		Transport: &http.Transport{
+			Dial:            dial,
+			TLSClientConfig: tlsConfig,
+		},
+	}
+	return &client, nil
+}
+
 type getWorkResponseJson struct {
 	Result struct {
 		Data   string
@@ -58,20 +109,44 @@ func createHTTPClient() *http.Client {
 
 // GetWork makes a getwork RPC call and returns the result (data and target)
 func GetWork() (*Work, error) {
+	// Generate a request to the configured RPC server.
+	protocol := "http"
+	if !cfg.NoTLS {
+		protocol = "https"
+	}
+	url := protocol + "://" + cfg.RPCServer
 	jsonStr := []byte(`{"jsonrpc": "2.0", "method": "getwork", "params": [], "id": 1}`)
-	req, err := http.NewRequest("POST", cfg.RPCConnect, bytes.NewBuffer(jsonStr))
-	req.Header.Set("Authorization", "Basic "+base64.StdEncoding.EncodeToString([]byte(cfg.Username+":"+cfg.Password)))
-	req.Header.Set("Content-Type", "application/json")
+	bodyBuff := bytes.NewBuffer(jsonStr)
+	httpRequest, err := http.NewRequest("POST", url, bodyBuff)
+	if err != nil {
+		return nil, err
+	}
+	httpRequest.Close = true
+	httpRequest.Header.Set("Content-Type", "application/json")
+
+	// Configure basic access authorization.
+	httpRequest.SetBasicAuth(cfg.RPCUser, cfg.RPCPassword)
+
+	// Create the new HTTP client that is configured according to the user-
+	// specified options and submit the request.
+	httpClient, err := newHTTPClient(cfg)
+	if err != nil {
+		return nil, err
+	}
+	httpResponse, err := httpClient.Do(httpRequest)
+	if err != nil {
+		return nil, err
+	}
 
-	resp, err := httpClient.Do(req)
+	body, err := ioutil.ReadAll(httpResponse.Body)
+	httpResponse.Body.Close()
 	if err != nil {
+		err = fmt.Errorf("error reading json reply: %v", err)
 		return nil, err
 	}
-	defer resp.Body.Close()
 
-	body, _ := ioutil.ReadAll(resp.Body)
-	if resp.Status != "200 OK" {
-		return nil, fmt.Errorf("HTTP %s: %s", resp.Status, body)
+	if httpResponse.Status != "200 OK" {
+		return nil, fmt.Errorf("HTTP %s: %s", httpResponse.Status, body)
 	}
 
 	var res getWorkResponseJson
@@ -107,21 +182,45 @@ func GetWork() (*Work, error) {
 
 // GetWork makes a getwork RPC call and returns the result (data and target)
 func GetWorkSubmit(data []byte) (bool, error) {
+	// Generate a request to the configured RPC server.
+	protocol := "http"
+	if !cfg.NoTLS {
+		protocol = "https"
+	}
+	url := protocol + "://" + cfg.RPCServer
 	hexData := hex.EncodeToString(data)
 	jsonStr := []byte(`{"jsonrpc": "2.0", "method": "getwork", "params": ["` + hexData + `"], "id": 1}`)
-	req, err := http.NewRequest("POST", cfg.RPCConnect, bytes.NewBuffer(jsonStr))
-	req.Header.Set("Authorization", "Basic "+base64.StdEncoding.EncodeToString([]byte(cfg.Username+":"+cfg.Password)))
-	req.Header.Set("Content-Type", "application/json")
+	bodyBuff := bytes.NewBuffer(jsonStr)
+	httpRequest, err := http.NewRequest("POST", url, bodyBuff)
+	if err != nil {
+		return false, err
+	}
+	httpRequest.Close = true
+	httpRequest.Header.Set("Content-Type", "application/json")
+
+	// Configure basic access authorization.
+	httpRequest.SetBasicAuth(cfg.RPCUser, cfg.RPCPassword)
+
+	// Create the new HTTP client that is configured according to the user-
+	// specified options and submit the request.
+	httpClient, err := newHTTPClient(cfg)
+	if err != nil {
+		return false, err
+	}
+	httpResponse, err := httpClient.Do(httpRequest)
+	if err != nil {
+		return false, err
+	}
 
-	resp, err := httpClient.Do(req)
+	body, err := ioutil.ReadAll(httpResponse.Body)
+	httpResponse.Body.Close()
 	if err != nil {
+		err = fmt.Errorf("error reading json reply: %v", err)
 		return false, err
 	}
-	defer resp.Body.Close()
 
-	body, _ := ioutil.ReadAll(resp.Body)
-	if resp.Status != "200 OK" {
-		return false, fmt.Errorf("error calling getwork (%s): %s", resp.Status, body)
+	if httpResponse.Status != "200 OK" {
+		return false, fmt.Errorf("error calling getwork (%s): %s", httpResponse.Status, body)
 	}
 
 	var res getWorkSubmitResponseJson
diff --git a/sample-gominer.conf b/sample-gominer.conf
new file mode 100644
index 0000000..e752eed
--- /dev/null
+++ b/sample-gominer.conf
@@ -0,0 +1,35 @@
+[Application Options]
+
+; ------------------------------------------------------------------------------
+; Network settings
+; ------------------------------------------------------------------------------
+
+; Use testnet (cannot be used with simnet=1).
+; testnet=1
+
+; Use simnet (cannot be used with testnet=1).
+; simnet=1
+
+
+; ------------------------------------------------------------------------------
+; RPC client settings
+; ------------------------------------------------------------------------------
+
+; Connect via a SOCKS5 proxy.
+; proxy=127.0.0.1:9050
+; proxyuser=
+; proxypass=
+
+; Username and password to authenticate connections to a Decred RPC server
+; (usually dcrd)
+; rpcuser=
+; rpcpass=
+
+; RPC server to connect to
+; rpcserver=localhost
+
+; RPC server certificate chain file for validation
+; rpccert=~/.dcrd/rpc.cert
+
+; Disable tls for rpc
+; notls=1

From 6a7a59bfd784e29faef0e159a9b6b505cc3710d8 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 5 May 2016 09:35:50 -0400
Subject: [PATCH 004/150] Merge pull request #5 from jcvernaleo/jcv_const

Remove unused constant
---
 miner.go | 2 --
 1 file changed, 2 deletions(-)

diff --git a/miner.go b/miner.go
index dae43d8..4e006eb 100644
--- a/miner.go
+++ b/miner.go
@@ -8,8 +8,6 @@ import (
 	"github.com/decred/gominer/cl"
 )
 
-const benchmark = true
-
 func getCLPlatforms() ([]cl.CL_platform_id, error) {
 	var numPlatforms cl.CL_uint
 	status := cl.CLGetPlatformIDs(0, nil, &numPlatforms)

From b935bbdf763bfa9bf9060a5ad3dd16ac74314d5f Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 5 May 2016 09:36:07 -0400
Subject: [PATCH 005/150] Fix it so CTRL-C actually stops miner. (#4)

The Miner.Stop() function now decrements the waitgroup
each time a device is stopped so it doesn't hang
waiting on things that have already stopped.
---
 miner.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/miner.go b/miner.go
index 4e006eb..f53a11f 100644
--- a/miner.go
+++ b/miner.go
@@ -174,5 +174,6 @@ func (m *Miner) Stop() {
 	close(m.quit)
 	for _, d := range m.devices {
 		d.Stop()
+		m.wg.Done()
 	}
 }

From 6b8bb1f1e5bdebbbc1d42c1b1e008de0109260f9 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 6 May 2016 11:49:34 -0400
Subject: [PATCH 006/150] Add -i/--intensity.

This makes intensity a tuneable parameter.

Leave the default at the initial 2^26 value.
---
 config.go           | 14 ++++++++++++++
 device.go           |  6 ++++--
 sample-gominer.conf |  6 ++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/config.go b/config.go
index 1bdcf5e..f44f835 100644
--- a/config.go
+++ b/config.go
@@ -29,6 +29,10 @@ var (
 	defaultRPCServer   = "localhost"
 	defaultRPCCertFile = filepath.Join(dcrdHomeDir, "rpc.cert")
 	defaultLogDir      = filepath.Join(minerHomeDir, defaultLogDirname)
+	defaultIntensity   = 26
+	// Took these values from cgminer.
+	minIntensity = 8
+	maxIntensity = 31
 )
 
 type config struct {
@@ -59,6 +63,8 @@ type config struct {
 	TestNet       bool `long:"testnet" description:"Connect to testnet"`
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
+
+	Intensity int `short:"i" long:"intensity" description:"Intensity."`
 }
 
 // normalizeAddress returns addr with the passed default port appended if
@@ -209,6 +215,7 @@ func loadConfig() (*config, []string, error) {
 		LogDir:     defaultLogDir,
 		RPCServer:  defaultRPCServer,
 		RPCCert:    defaultRPCCertFile,
+		Intensity:  defaultIntensity,
 	}
 
 	// Create the home directory if it doesn't already exist.
@@ -278,6 +285,13 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
+	if (cfg.Intensity < minIntensity) || (cfg.Intensity > maxIntensity) {
+		err := fmt.Errorf("Intensity %v not without range %v to %v.",
+			cfg.Intensity, minIntensity, maxIntensity)
+		fmt.Fprintln(os.Stderr, err)
+		return nil, nil, err
+	}
+
 	// Special show command to list supported subsystems and exit.
 	if cfg.DebugLevel == "show" {
 		fmt.Println("Supported subsystems", supportedSubsystems())
diff --git a/device.go b/device.go
index 3cd9659..6d5a96c 100644
--- a/device.go
+++ b/device.go
@@ -4,6 +4,7 @@ import (
 	"encoding/binary"
 	"encoding/hex"
 	"fmt"
+	"math"
 	"os"
 	"unsafe"
 
@@ -13,7 +14,6 @@ import (
 
 const (
 	outputBufferSize = cl.CL_size_t(64)
-	globalWorksize   = 65536 * 1024
 	localWorksize    = 64
 	uint32Size       = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
 
@@ -229,6 +229,8 @@ func (d *Device) Run() {
 func (d *Device) runDevice() error {
 	minrLog.Infof("Started GPU #%d", d.index)
 	outputData := make([]uint32, outputBufferSize)
+	globalWorksize := math.Exp2(float64(cfg.Intensity))
+	minrLog.Debugf("Intensity %v", cfg.Intensity)
 	var status cl.CL_int
 	for {
 		d.updateCurrentWork()
@@ -280,7 +282,7 @@ func (d *Device) runDevice() error {
 
 		// Execute the kernel
 		var globalWorkSize [1]cl.CL_size_t
-		globalWorkSize[0] = globalWorksize
+		globalWorkSize[0] = cl.CL_size_t(globalWorksize)
 		var localWorkSize [1]cl.CL_size_t
 		localWorkSize[0] = localWorksize
 		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil, globalWorkSize[:], localWorkSize[:], 0, nil, nil)
diff --git a/sample-gominer.conf b/sample-gominer.conf
index e752eed..9034b24 100644
--- a/sample-gominer.conf
+++ b/sample-gominer.conf
@@ -33,3 +33,9 @@
 
 ; Disable tls for rpc
 ; notls=1
+
+; ------------------------------------------------------------------------------
+; Mining settings
+; ------------------------------------------------------------------------------
+
+; intensity=26

From 55b4c6abb406f273b722d2c96621b6dfe95d3c5f Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 5 May 2016 13:32:30 -0400
Subject: [PATCH 007/150] Add support for stratum pools.

This is still a work in progress.

This allows gominer to connect to a stratum enabled pool using the
options:
gominer -o=stratum+tcp://pool:port -m userid -n password

GetWork style data is generated from the stratum data.

A very minimal test server (notify.go) is included to provide
fixed data to send to miners for debug purposes.  This can be extended
for additional testing later.

Pools are ignored in benchmark mode.

Misc other changes:

Add devicename to output.

Slow down hashmeter printing.

Check hash vs target using dcrd code and big.Int rather
than the code that was in gominer.

Contains work by jcv and jolan.
---
 config.go        |    5 +
 device.go        |   61 ++-
 getwork.go       |   63 +++
 log.go           |    4 +
 miner.go         |   56 ++-
 notify/notify.go |   63 +++
 stratum.go       | 1008 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1239 insertions(+), 21 deletions(-)
 create mode 100644 notify/notify.go
 create mode 100644 stratum.go

diff --git a/config.go b/config.go
index f44f835..f17dd04 100644
--- a/config.go
+++ b/config.go
@@ -65,6 +65,11 @@ type config struct {
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
 	Intensity int `short:"i" long:"intensity" description:"Intensity."`
+
+	// Pool related options
+	Pool         string `short:"o" long:"pool" description:"Pool to connect to (e.g.stratum+tcp://pool:port) "`
+	PoolUser     string `short:"m" long:"pooluser" description:"Pool username"`
+	PoolPassword string `short:"n" long:"poolpass" default-mask:"-" description:"Pool password"`
 }
 
 // normalizeAddress returns addr with the passed default port appended if
diff --git a/device.go b/device.go
index 6d5a96c..9a22730 100644
--- a/device.go
+++ b/device.go
@@ -5,9 +5,13 @@ import (
 	"encoding/hex"
 	"fmt"
 	"math"
+	"math/big"
 	"os"
 	"unsafe"
 
+	"github.com/decred/dcrd/blockchain"
+	"github.com/decred/dcrd/chaincfg/chainhash"
+
 	"github.com/decred/gominer/blake256"
 	"github.com/decred/gominer/cl"
 )
@@ -58,6 +62,7 @@ type Device struct {
 	index        int
 	platformID   cl.CL_platform_id
 	deviceID     cl.CL_device_id
+	deviceName   string
 	context      cl.CL_context
 	queue        cl.CL_command_queue
 	outputBuffer cl.CL_mem
@@ -102,6 +107,7 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 		index:      index,
 		platformID: platformID,
 		deviceID:   deviceID,
+		deviceName: getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
 		quit:       make(chan struct{}),
 		newWork:    make(chan *Work, 5),
 		workDone:   workDone,
@@ -227,7 +233,7 @@ func (d *Device) Run() {
 }
 
 func (d *Device) runDevice() error {
-	minrLog.Infof("Started GPU #%d", d.index)
+	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
 	globalWorksize := math.Exp2(float64(cfg.Intensity))
 	minrLog.Debugf("Intensity %v", cfg.Intensity)
@@ -323,7 +329,17 @@ func (d *Device) foundCandidate(nonce1 uint32, nonce0 uint32) {
 		binary.BigEndian.PutUint32(hash[i*4:], state[i])
 	}
 
-	if hashSmaller(hash[:], d.work.Target[:]) {
+	newHash, err := chainhash.NewHashFromStr(hex.EncodeToString(hash[:]))
+	if err != nil {
+		minrLog.Error(err)
+	}
+	hashNum := blockchain.ShaHashToBig(newHash)
+	target := new(big.Int)
+	target.SetString(hex.EncodeToString(d.work.Target[:]), 16)
+	if hashNum.Cmp(target) > 0 {
+		minrLog.Infof("Hash %s below target %s", hex.EncodeToString(hash[:]), hex.EncodeToString(d.work.Target[:]))
+
+	} else {
 		minrLog.Infof("Found hash!!  %s", hex.EncodeToString(hash[:]))
 		d.workDone <- data
 	}
@@ -339,21 +355,48 @@ func (d *Device) SetWork(w *Work) {
 
 func formatHashrate(h float64) string {
 	if h > 1000000000 {
-		return fmt.Sprintf("%.3f GH/s", h/1000000000)
+		return fmt.Sprintf("%.1fGH/s", h/1000000000)
 	} else if h > 1000000 {
-		return fmt.Sprintf("%.3f MH/s", h/1000000)
+		return fmt.Sprintf("%.0fMH/s", h/1000000)
 	} else if h > 1000 {
-		return fmt.Sprintf("%.3f kH/s", h/1000)
-	} else {
-		return fmt.Sprintf("%.3f GH/s", h)
+		return fmt.Sprintf("%.1fkH/s", h/1000)
+	} else if h == 0 {
+		return "0H/s"
 	}
+
+	return fmt.Sprintf("%.1f GH/s", h)
+}
+
+func getDeviceInfo(id cl.CL_device_id,
+	name cl.CL_device_info,
+	str string) string {
+
+	var errNum cl.CL_int
+	var paramValueSize cl.CL_size_t
+
+	errNum = cl.CLGetDeviceInfo(id, name, 0, nil, &paramValueSize)
+
+	if errNum != cl.CL_SUCCESS {
+		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
+	}
+
+	var info interface{}
+	errNum = cl.CLGetDeviceInfo(id, name, paramValueSize, &info, nil)
+	if errNum != cl.CL_SUCCESS {
+		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
+	}
+
+	strinfo := fmt.Sprintf("%v", info)
+
+	return strinfo
 }
 
 func (d *Device) PrintStats() {
 	alpha := 0.95
 	d.workDoneEMA = d.workDoneEMA*alpha + d.workDoneLast*(1-alpha)
 	d.workDoneLast = 0
-	d.runningTime += 1.0
+	d.runningTime += 5.0
 
-	minrLog.Infof("EMA %s, avg %s", formatHashrate(d.workDoneEMA), formatHashrate(d.workDoneTotal/d.runningTime))
+	minrLog.Infof("GPU #%d: %s, EMA %s avg %s", d.index, d.deviceName,
+		formatHashrate(d.workDoneEMA), formatHashrate(d.workDoneTotal/d.runningTime))
 }
diff --git a/getwork.go b/getwork.go
index 5d15834..440fb1a 100644
--- a/getwork.go
+++ b/getwork.go
@@ -10,6 +10,7 @@ import (
 	"io/ioutil"
 	"net"
 	"net/http"
+	"strconv"
 	"time"
 
 	"github.com/btcsuite/go-socks/socks"
@@ -180,6 +181,37 @@ func GetWork() (*Work, error) {
 	return &w, nil
 }
 
+// GetPoolWork gets work from a stratum enabled pool
+func GetPoolWork(pool *Stratum) (*Work, error) {
+	// Get Next work for stratum and mark it as used
+	if pool.PoolWork.NewWork {
+		poolLog.Info("Received new work from pool.")
+		// Mark used
+		pool.PoolWork.NewWork = false
+
+		if pool.PoolWork.JobID == "" {
+			return nil, fmt.Errorf("No work available (no job id)")
+		}
+
+		err := pool.PrepWork()
+		if err != nil {
+			return nil, err
+		}
+
+		intJob, _ := strconv.ParseInt(pool.PoolWork.JobID, 16, 0)
+		poolLog.Infof("job %v height %v", intJob, pool.PoolWork.Height)
+
+		return pool.PoolWork.Work, nil
+	}
+
+	// Return the work we already had, do not recalculate
+	if pool.PoolWork.Work != nil {
+		return pool.PoolWork.Work, nil
+	}
+
+	return nil, fmt.Errorf("No work available.")
+}
+
 // GetWork makes a getwork RPC call and returns the result (data and target)
 func GetWorkSubmit(data []byte) (bool, error) {
 	// Generate a request to the configured RPC server.
@@ -235,3 +267,34 @@ func GetWorkSubmit(data []byte) (bool, error) {
 
 	return res.Result, nil
 }
+
+// GetPoolWorkSubmit sends the result to the stratum enabled pool
+func GetPoolWorkSubmit(data []byte, pool *Stratum) (bool, error) {
+
+	sub, err := pool.PrepSubmit(data)
+	if err != nil {
+		return false, err
+	}
+
+	// json encode
+	m, err := json.Marshal(sub)
+	if err != nil {
+		return false, err
+	}
+
+	// send
+	poolLog.Tracef("> %s", m)
+	_, err = pool.Conn.Write(m)
+	if err != nil {
+		return false, err
+	}
+	_, err = pool.Conn.Write([]byte("\n"))
+	if err != nil {
+		return false, err
+	}
+	pool.submitted = true
+
+	pool.PoolWork.Work = nil
+
+	return false, nil
+}
diff --git a/log.go b/log.go
index 1b838cd..a5a79d8 100644
--- a/log.go
+++ b/log.go
@@ -12,11 +12,13 @@ var (
 	backendLog = seelog.Disabled
 	mainLog    = btclog.Disabled
 	minrLog    = btclog.Disabled
+	poolLog    = btclog.Disabled
 )
 
 var subsystemLoggers = map[string]btclog.Logger{
 	"MAIN": mainLog,
 	"MINR": minrLog,
+	"POOL": poolLog,
 }
 
 // useLogger updates the logger references for subsystemID to logger.  Invalid
@@ -32,6 +34,8 @@ func useLogger(subsystemID string, logger btclog.Logger) {
 		mainLog = logger
 	case "MINR":
 		minrLog = logger
+	case "POOL":
+		poolLog = logger
 	}
 }
 
diff --git a/miner.go b/miner.go
index f53a11f..479ca27 100644
--- a/miner.go
+++ b/miner.go
@@ -43,6 +43,7 @@ type Miner struct {
 	quit             chan struct{}
 	needsWorkRefresh chan struct{}
 	wg               sync.WaitGroup
+	pool             *Stratum
 }
 
 func NewMiner() (*Miner, error) {
@@ -52,6 +53,15 @@ func NewMiner() (*Miner, error) {
 		needsWorkRefresh: make(chan struct{}),
 	}
 
+	// If needed, start pool code.
+	if cfg.Pool != "" && !cfg.Benchmark {
+		s, err := StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword)
+		if err != nil {
+			return nil, err
+		}
+		m.pool = s
+	}
+
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
 		return nil, fmt.Errorf("Could not get CL platforms: %v", err)
@@ -82,12 +92,23 @@ func (m *Miner) workSubmitThread() {
 		case <-m.quit:
 			return
 		case data := <-m.workDone:
-			accepted, err := GetWorkSubmit(data)
-			if err != nil {
-				minrLog.Errorf("Error submitting work: %v", err)
+			// Only use getwork submission if we are not using a pool.
+			if m.pool == nil {
+				accepted, err := GetWorkSubmit(data)
+				if err != nil {
+					minrLog.Errorf("Error submitting work: %v", err)
+				} else {
+					minrLog.Errorf("Submitted work successfully: %v", accepted)
+					m.needsWorkRefresh <- struct{}{}
+				}
 			} else {
-				minrLog.Errorf("Submitted work successfully: %v", accepted)
-				m.needsWorkRefresh <- struct{}{}
+				accepted, err := GetPoolWorkSubmit(data, m.pool)
+				if err != nil {
+					minrLog.Errorf("Error submitting work to pool: %v", err)
+				} else {
+					minrLog.Errorf("Submitted work to pool successfully: %v", accepted)
+					m.needsWorkRefresh <- struct{}{}
+				}
 			}
 		}
 	}
@@ -100,15 +121,26 @@ func (m *Miner) workRefreshThread() {
 	defer t.Stop()
 
 	for {
-		work, err := GetWork()
-		if err != nil {
-			minrLog.Errorf("Error in getwork: %v", err)
+		// Only use getwork if we are not using a pool.
+		if m.pool == nil {
+			work, err := GetWork()
+			if err != nil {
+				minrLog.Errorf("Error in getwork: %v", err)
+			} else {
+				for _, d := range m.devices {
+					d.SetWork(work)
+				}
+			}
 		} else {
-			for _, d := range m.devices {
-				d.SetWork(work)
+			work, err := GetPoolWork(m.pool)
+			if err != nil {
+				minrLog.Errorf("Error in getpoolwork: %v", err)
+			} else {
+				for _, d := range m.devices {
+					d.SetWork(work)
+				}
 			}
 		}
-
 		select {
 		case <-m.quit:
 			return
@@ -121,7 +153,7 @@ func (m *Miner) workRefreshThread() {
 func (m *Miner) printStatsThread() {
 	defer m.wg.Done()
 
-	t := time.NewTicker(time.Second)
+	t := time.NewTicker(time.Second * 5)
 	defer t.Stop()
 
 	for {
diff --git a/notify/notify.go b/notify/notify.go
new file mode 100644
index 0000000..2cfc6d6
--- /dev/null
+++ b/notify/notify.go
@@ -0,0 +1,63 @@
+// Copyright (c) 2016 The Decred developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+// This is a simple server to provide static responses similar to a stratum
+// server for debug purposes.
+
+package main
+
+import (
+	"fmt"
+	"net"
+)
+
+func main() {
+	ln, err := net.Listen("tcp", ":2222")
+	if err != nil {
+		fmt.Println(err)
+	}
+	for {
+		conn, err := ln.Accept()
+		if err != nil {
+			fmt.Println(err)
+		}
+		go handleConnection(conn)
+	}
+
+}
+
+func handleConnection(c net.Conn) {
+	msg1 := `{"id":1,"result":[[["mining.set_difficulty","deadbeefcafebabecc7e1c0000000000"],["mining.notify","deadbeefcafebabecc7e1c0000000000"]],"00000000000000000fe43fbb",12],"error":null}`
+	msg2 := `{"id":null,"method":"mining.set_difficulty","params":[8]}`
+	msg3 := `{"id":null,"method":"mining.notify","params":["bb3b","6ea8e28a4b172946d743eb3382785120990fe73c247e3dbd000004fc00000000","b25bc74bba24acd4729e61c9f4c53e4f457dc3082d2d28355ae6e6df65e54b4a2040ba54288130410bbcfc548b13711b039bc89f17b3bacb6532bd9001e183f101005c141421c2b80400000097a50000d9f8171ad0357a0f0100000069a3000081460000dbca76570000000000000000","",[],"01000000","1a17f8d9","5776cadb",true]}`
+	// WorkData generated from that should be:
+	// 010000008ae2a86e4629174b33eb43d7205178823ce70f99bd3d7e24fc04000000000000b25bc74bba24acd4729e61c9f4c53e4f457dc3082d2d28355ae6e6df65e54b4a2040ba54288130410bbcfc548b13711b039bc89f17b3bacb6532bd9001e183f101005c141421c2b80400000097a50000d9f8171ad0357a0f0100000069a3000081460000dbca7657000000000000000000188fec0fe43fbb000000000000000000000000000000000000000000000000
+
+	buf := make([]byte, 1024)
+	_, err := c.Read(buf)
+	if err != nil {
+		fmt.Println("Error reading:", err.Error())
+	}
+
+	fmt.Println(string(buf))
+
+	send("subscribe reply", []byte(msg1), c)
+	send("difficulty", []byte(msg2), c)
+	send("notify", []byte(msg3), c)
+
+	//c.Close()
+}
+
+func send(mType string, m []byte, c net.Conn) {
+	fmt.Println("Sending ", mType)
+	_, err := c.Write(m)
+	if err != nil {
+		fmt.Println(err)
+	}
+	_, err = c.Write([]byte("\n"))
+	if err != nil {
+		fmt.Println(err)
+	}
+
+}
diff --git a/stratum.go b/stratum.go
new file mode 100644
index 0000000..f0a879d
--- /dev/null
+++ b/stratum.go
@@ -0,0 +1,1008 @@
+// Copyright (c) 2016 The Decred developers
+
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"crypto/rand"
+	"encoding/binary"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"math/big"
+	"net"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/davecgh/go-spew/spew"
+
+	"github.com/decred/dcrd/wire"
+)
+
+// Stratum holds all the shared information for a stratum connection.
+// XXX most of these should be unexported and use getters/setters.
+type Stratum struct {
+	Pool      string
+	User      string
+	Pass      string
+	Conn      net.Conn
+	Reader    *bufio.Reader
+	ID        uint64
+	authID    uint64
+	subID     uint64
+	submitID  uint64
+	Diff      float64
+	Target    string
+	submitted bool
+	PoolWork  NotifyWork
+}
+
+// NotifyWork holds all the info received from a mining.notify message along
+// with the Work data generated from it.
+type NotifyWork struct {
+	Clean             bool
+	ExtraNonce1       string
+	ExtraNonce2       uint64
+	ExtraNonce2Length float64
+	CB1               string
+	CB2               string
+	Height            int64
+	NtimeDelta        int64
+	JobID             string
+	Hash              string
+	Nbits             string
+	Ntime             string
+	Version           string
+	NewWork           bool
+	Work              *Work
+}
+
+// StratumMsg is the basic message object from stratum.
+type StratumMsg struct {
+	Method string `json:"method"`
+	// Need to make generic.
+	Params []string    `json:"params"`
+	ID     interface{} `json:"id"`
+}
+
+// StratumRsp is the basic response type from stratum.
+type StratumRsp struct {
+	Method string `json:"method"`
+	// Need to make generic.
+	ID     interface{}      `json:"id"`
+	Error  StratErr         `json:"error,omitempty"`
+	Result *json.RawMessage `json:"result,omitempty"`
+}
+
+// StratErr is the basic error type (a number and a string) sent by
+// the stratum server.
+type StratErr struct {
+	ErrNum uint64
+	ErrStr string
+}
+
+// BasicReply is a reply type for any of the simple messages.
+type BasicReply struct {
+	ID     interface{} `json:"id"`
+	Error  StratErr    `json:"error,omitempty"`
+	Result bool        `json:"result"`
+}
+
+// SubscribeReply models the server response to a subscribe message.
+type SubscribeReply struct {
+	SubscribeID       string
+	ExtraNonce1       string
+	ExtraNonce2Length float64
+}
+
+// NotifyRes models the json from a mining.notify message.
+type NotifyRes struct {
+	JobID          string
+	Hash           string
+	GenTX1         string
+	GenTX2         string
+	MerkleBranches []string
+	BlockVersion   string
+	Nbits          string
+	Ntime          string
+	CleanJobs      bool
+}
+
+// Submit models a submission message.
+type Submit struct {
+	Method string      `json:"method"`
+	Params []string    `json:"params"`
+	ID     interface{} `json:"id"`
+}
+
+// errJsonType is an error for json that we do not expect.
+var errJsonType = errors.New("Unexpected type in json.")
+
+// StratumConn starts the initial connection to a stratum pool and sets defaults
+// in the pool object.
+func StratumConn(pool, user, pass string) (*Stratum, error) {
+	poolLog.Infof("Using pool: %v", pool)
+	proto := "stratum+tcp://"
+	if strings.HasPrefix(pool, proto) {
+		pool = strings.Replace(pool, proto, "", 1)
+	} else {
+		err := errors.New("Only stratum pools supported.")
+		return nil, err
+	}
+	conn, err := net.Dial("tcp", pool)
+	if err != nil {
+		return nil, err
+	}
+	var stratum Stratum
+	stratum.ID = 1
+	stratum.Conn = conn
+	stratum.Pool = pool
+	stratum.User = user
+	stratum.Pass = pass
+	// We will set it for sure later but this really should be the value and
+	// setting it here will prevent some incorrect matches based on the
+	// default 0 value.
+	stratum.authID = 2
+	// Target for share is 1 unless we hear otherwise.
+	stratum.Diff = 1
+	stratum.Target = stratum.diffToTarget(stratum.Diff)
+	stratum.PoolWork.NewWork = false
+	stratum.Reader = bufio.NewReader(stratum.Conn)
+	go stratum.Listen()
+
+	err = stratum.Subscribe()
+	if err != nil {
+		return nil, err
+	}
+	// Should NOT need this.
+	//time.Sleep(5 * time.Second)
+	err = stratum.Auth()
+	if err != nil {
+		return nil, err
+	}
+
+	return &stratum, nil
+}
+
+// Reconnect redials s.Pool and re-sends subscribe and authorize, returning the first error.
+func (s *Stratum) Reconnect() error {
+	conn, err := net.Dial("tcp", s.Pool)
+	if err != nil {
+		return err
+	}
+	s.Conn = conn
+	s.Reader = bufio.NewReader(s.Conn)
+	err = s.Subscribe()
+	if err != nil {
+		return err // surface the failure; Listen checks this result
+	}
+	// Should NOT need this.
+	time.Sleep(5 * time.Second)
+	// XXX Do I really need to re-auth here?
+	err = s.Auth()
+	if err != nil {
+		return err // surface the failure; Listen checks this result
+	}
+	return nil
+}
+
+// Listen is the listener for the incoming messages from the stratum pool.
+func (s *Stratum) Listen() {
+	poolLog.Debug("Starting Listener")
+
+	for {
+		result, err := s.Reader.ReadString('\n')
+		if err != nil {
+			if err == io.EOF {
+				poolLog.Error("Connection lost!  Reconnecting.")
+				err = s.Reconnect()
+				if err != nil {
+					poolLog.Error(err)
+					poolLog.Error("Reconnect failed.")
+					os.Exit(1)
+					return
+				}
+
+			} else {
+				poolLog.Error(err)
+			}
+			continue
+		}
+		poolLog.Debug(strings.TrimSuffix(result, "\n"))
+		resp, err := s.Unmarshal([]byte(result))
+		if err != nil {
+			poolLog.Error(err)
+			continue
+		}
+		switch resp.(type) {
+		case *BasicReply:
+			aResp := resp.(*BasicReply)
+			if int(aResp.ID.(uint64)) == int(s.authID) {
+				if aResp.Result {
+					poolLog.Info("Logged in")
+				} else {
+					poolLog.Error("Auth failure.")
+				}
+			}
+			if aResp.ID == s.submitID {
+				if aResp.Result {
+					poolLog.Info("Share Accepted")
+				} else {
+					poolLog.Error("Share rejected: ", aResp.Error.ErrStr)
+				}
+				s.submitted = false
+			}
+		case StratumMsg:
+			nResp := resp.(StratumMsg)
+			poolLog.Trace(nResp)
+			// Too much is still handled in unmarshaler.  Need to
+			// move stuff other than unmarshalling here.
+			switch nResp.Method {
+			case "client.show_message":
+				poolLog.Info(nResp.Params)
+			case "client.reconnect":
+				poolLog.Info("Reconnect requested")
+				wait, err := strconv.Atoi(nResp.Params[2])
+				if err != nil {
+					poolLog.Error(err)
+					continue
+				}
+				time.Sleep(time.Duration(wait) * time.Second)
+				pool := nResp.Params[0] + ":" + nResp.Params[1]
+				s.Pool = pool
+				err = s.Reconnect()
+				if err != nil {
+					poolLog.Error(err)
+					// XXX should just die at this point
+					// but we don't really have access to
+					// the channel to end everything.
+					return
+				}
+			case "client.get_version":
+				poolLog.Debug("get_version request received.")
+				msg := StratumMsg{
+					Method: nResp.Method,
+					ID:     nResp.ID,
+					Params: []string{"decred-gominer/" + version()},
+				}
+				m, err := json.Marshal(msg)
+				if err != nil {
+					poolLog.Error(err)
+					continue
+				}
+				_, err = s.Conn.Write(m)
+				if err != nil {
+					poolLog.Error(err)
+					continue
+				}
+				_, err = s.Conn.Write([]byte("\n"))
+				if err != nil {
+					poolLog.Error(err)
+					continue
+				}
+			}
+		case NotifyRes:
+			nResp := resp.(NotifyRes)
+			s.PoolWork.JobID = nResp.JobID
+			s.PoolWork.CB1 = nResp.GenTX1
+			//poolLog.Trace("CB1: " + spew.Sdump(s.PoolWork.CB1))
+			//height := nResp.GenTX1[184:188]
+			heightHex := nResp.GenTX1[186:188] + nResp.GenTX1[184:186]
+			height, err := strconv.ParseInt(heightHex, 16, 32)
+			if err != nil {
+				poolLog.Tracef("failed to parse height %v", err)
+				height = 0
+			}
+			s.PoolWork.Height = height
+			s.PoolWork.CB2 = nResp.GenTX2
+			s.PoolWork.Hash = nResp.Hash
+			s.PoolWork.Nbits = nResp.Nbits
+			s.PoolWork.Version = nResp.BlockVersion
+			parsedNtime, err := strconv.ParseInt(nResp.Ntime, 16, 64)
+			if err != nil {
+				poolLog.Error(err)
+			}
+			s.PoolWork.Ntime = nResp.Ntime
+			s.PoolWork.NtimeDelta = parsedNtime - time.Now().Unix()
+			s.PoolWork.Clean = nResp.CleanJobs
+			s.PoolWork.NewWork = true
+			poolLog.Trace("notify: ", spew.Sdump(nResp))
+		case *SubscribeReply:
+			nResp := resp.(*SubscribeReply)
+			s.PoolWork.ExtraNonce1 = nResp.ExtraNonce1
+			s.PoolWork.ExtraNonce2Length = nResp.ExtraNonce2Length
+			poolLog.Info("Subscribe reply received.")
+			poolLog.Trace(spew.Sdump(resp))
+		default:
+			poolLog.Info("Unhandled message: ", result)
+		}
+	}
+}
+
+// Auth sends a message to the pool to authorize a worker.
+func (s *Stratum) Auth() error {
+	msg := StratumMsg{
+		Method: "mining.authorize",
+		ID:     s.ID,
+		Params: []string{s.User, s.Pass},
+	}
+	// Auth reply has no method so need a way to identify it.
+	// Ugly, but not much choice.
+	id, ok := msg.ID.(uint64)
+	if !ok {
+		return errJsonType
+	}
+	s.authID = id
+	s.ID += 1
+	poolLog.Tracef("> %v", msg)
+	m, err := json.Marshal(msg)
+	if err != nil {
+		return err
+	}
+	_, err = s.Conn.Write(m)
+	if err != nil {
+		return err
+	}
+	_, err = s.Conn.Write([]byte("\n"))
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// Subscribe sends the subscribe message to get mining info for a worker.
+func (s *Stratum) Subscribe() error {
+	msg := StratumMsg{
+		Method: "mining.subscribe",
+		ID:     s.ID,
+		Params: []string{"decred-gominer/" + version()},
+	}
+	s.subID = msg.ID.(uint64)
+	s.ID++
+	m, err := json.Marshal(msg)
+	if err != nil {
+		return err
+	}
+	poolLog.Tracef("> %v", string(m))
+	_, err = s.Conn.Write(m)
+	if err != nil {
+		return err
+	}
+	_, err = s.Conn.Write([]byte("\n"))
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// Unmarshal provides a json unmarshaler for the commands.
+// I'm sure a lot of this can be generalized but the json we deal with
+// is pretty yucky.
+func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
+	var (
+		objmap map[string]json.RawMessage
+		method string
+		id     uint64
+	)
+
+	err := json.Unmarshal(blob, &objmap)
+	if err != nil {
+		return nil, err
+	}
+	// decode command
+	// Not everyone has a method
+	err = json.Unmarshal(objmap["method"], &method)
+	if err != nil {
+		method = ""
+	}
+	err = json.Unmarshal(objmap["id"], &id)
+	if err != nil {
+		return nil, err
+	}
+	poolLog.Trace("Received: method: ", method, " id: ", id)
+	if id == s.authID {
+		var (
+			objmap      map[string]json.RawMessage
+			id          uint64
+			result      bool
+			errorHolder []interface{}
+		)
+		err := json.Unmarshal(blob, &objmap)
+		if err != nil {
+			return nil, err
+		}
+		resp := &BasicReply{}
+
+		err = json.Unmarshal(objmap["id"], &id)
+		if err != nil {
+			return nil, err
+		}
+		resp.ID = id
+
+		err = json.Unmarshal(objmap["result"], &result)
+		if err != nil {
+			return nil, err
+		}
+		err = json.Unmarshal(objmap["error"], &errorHolder)
+		if err != nil {
+			return nil, err
+		}
+		resp.Result = result
+
+		if errorHolder != nil {
+			errN, ok := errorHolder[0].(float64)
+			if !ok {
+				return nil, errJsonType
+			}
+			errS, ok := errorHolder[1].(string)
+			if !ok {
+				return nil, errJsonType
+			}
+			resp.Error.ErrNum = uint64(errN)
+			resp.Error.ErrStr = errS
+		}
+
+		return resp, nil
+
+	}
+	if id == s.subID {
+		var resi []interface{}
+		err := json.Unmarshal(objmap["result"], &resi)
+		if err != nil {
+			return nil, err
+		}
+		poolLog.Trace(resi)
+		resp := &SubscribeReply{}
+
+		var objmap2 map[string]json.RawMessage
+		err = json.Unmarshal(blob, &objmap2)
+		if err != nil {
+			return nil, err
+		}
+
+		var resJS []json.RawMessage
+		err = json.Unmarshal(objmap["result"], &resJS)
+		if err != nil {
+			return nil, err
+		}
+
+		var msgPeak []interface{}
+		err = json.Unmarshal(resJS[0], &msgPeak)
+		if err != nil {
+			return nil, err
+		}
+
+		// The pools do not all agree on what this message looks like
+		// so we need to actually look at it before unmarshalling for
+		// real so we can use the right form.  Yuck.
+		if msgPeak[0] == "mining.notify" {
+			var innerMsg []string
+			err = json.Unmarshal(resJS[0], &innerMsg)
+			if err != nil {
+				return nil, err
+			}
+			resp.SubscribeID = innerMsg[1]
+		} else {
+			var innerMsg [][]string
+			err = json.Unmarshal(resJS[0], &innerMsg)
+			if err != nil {
+				return nil, err
+			}
+			for i := 0; i < len(innerMsg); i++ {
+				if innerMsg[i][0] == "mining.notify" {
+					resp.SubscribeID = innerMsg[i][1]
+				}
+				if innerMsg[i][0] == "mining.set_difficulty" {
+					// Not all pools correctly put something
+					// in here so we will ignore it (we
+					// already have the default value of 1
+					// anyway and the pool can send a new one).
+					// dcr.coinmine.pl puts something that
+					// is not a difficulty here which is why
+					// we ignore.
+				}
+			}
+
+		}
+		resp.ExtraNonce1 = resi[1].(string)
+		resp.ExtraNonce2Length = resi[2].(float64)
+		return resp, nil
+	}
+	if id == s.submitID && s.submitted {
+		var (
+			objmap      map[string]json.RawMessage
+			id          uint64
+			result      bool
+			errorHolder []interface{}
+		)
+		err := json.Unmarshal(blob, &objmap)
+		if err != nil {
+			return nil, err
+		}
+		resp := &BasicReply{}
+
+		err = json.Unmarshal(objmap["id"], &id)
+		if err != nil {
+			return nil, err
+		}
+		resp.ID = id
+
+		err = json.Unmarshal(objmap["result"], &result)
+		if err != nil {
+			return nil, err
+		}
+		err = json.Unmarshal(objmap["error"], &errorHolder)
+		if err != nil {
+			return nil, err
+		}
+		resp.Result = result
+
+		if errorHolder != nil {
+			errN, ok := errorHolder[0].(float64)
+			if !ok {
+				return nil, errJsonType
+			}
+			errS, ok := errorHolder[1].(string)
+			if !ok {
+				return nil, errJsonType
+			}
+			resp.Error.ErrNum = uint64(errN)
+			resp.Error.ErrStr = errS
+		}
+
+		return resp, nil
+	}
+	switch method {
+	case "mining.notify":
+		poolLog.Trace("Unmarshal mining.notify")
+		var resi []interface{}
+		err := json.Unmarshal(objmap["params"], &resi)
+		if err != nil {
+			return nil, err
+		}
+		poolLog.Trace(resi)
+		var nres = NotifyRes{}
+		jobID, ok := resi[0].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.JobID = jobID
+		hash, ok := resi[1].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.Hash = hash
+		genTX1, ok := resi[2].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.GenTX1 = genTX1
+		genTX2, ok := resi[3].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.GenTX2 = genTX2
+		//ccminer code also confirms this
+		//nres.MerkleBranches = resi[4].([]string)
+		blockVersion, ok := resi[5].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.BlockVersion = blockVersion
+		nbits, ok := resi[6].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.Nbits = nbits
+		ntime, ok := resi[7].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.Ntime = ntime
+		cleanJobs, ok := resi[8].(bool)
+		if !ok {
+			return nil, errJsonType
+		}
+		nres.CleanJobs = cleanJobs
+		return nres, nil
+	case "mining.set_difficulty":
+		poolLog.Trace("Received new difficulty.")
+		var resi []interface{}
+		err := json.Unmarshal(objmap["params"], &resi)
+		if err != nil {
+			return nil, err
+		}
+
+		difficulty, ok := resi[0].(float64)
+		if !ok {
+			return nil, errJsonType
+		}
+		s.Target = s.diffToTarget(difficulty)
+		s.Diff = difficulty
+		var nres = StratumMsg{}
+		nres.Method = method
+		diffStr := strconv.FormatFloat(difficulty, 'E', -1, 32)
+		var params []string
+		params = append(params, diffStr)
+		nres.Params = params
+		poolLog.Infof("Stratum difficulty set to %v", difficulty)
+		return nres, nil
+	case "client.show_message":
+		var resi []interface{}
+		err := json.Unmarshal(objmap["result"], &resi)
+		if err != nil {
+			return nil, err
+		}
+		msg, ok := resi[0].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		var nres = StratumMsg{}
+		nres.Method = method
+		var params []string
+		params = append(params, msg)
+		nres.Params = params
+		return nres, nil
+	case "client.get_version":
+		var nres = StratumMsg{}
+		var id uint64
+		err = json.Unmarshal(objmap["id"], &id)
+		if err != nil {
+			return nil, err
+		}
+		nres.Method = method
+		nres.ID = id
+		return nres, nil
+	case "client.reconnect":
+		var nres = StratumMsg{}
+		var id uint64
+		err = json.Unmarshal(objmap["id"], &id)
+		if err != nil {
+			return nil, err
+		}
+		nres.Method = method
+		nres.ID = id
+
+		var resi []interface{}
+		err := json.Unmarshal(objmap["params"], &resi)
+		if err != nil {
+			return nil, err
+		}
+		poolLog.Trace(resi)
+
+		if len(resi) < 3 {
+			return nil, errJsonType
+		}
+		hostname, ok := resi[0].(string)
+		if !ok {
+			return nil, errJsonType
+		}
+		p, ok := resi[1].(float64)
+		if !ok {
+			return nil, errJsonType
+		}
+		port := strconv.Itoa(int(p))
+		w, ok := resi[2].(float64)
+		if !ok {
+			return nil, errJsonType
+		}
+		wait := strconv.Itoa(int(w))
+
+		nres.Params = []string{hostname, port, wait}
+
+		return nres, nil
+	default:
+		resp := &StratumRsp{}
+		err := json.Unmarshal(blob, &resp)
+		if err != nil {
+			return nil, err
+		}
+		return resp, nil
+	}
+}
+
+// PrepWork converts the stratum notify to getwork style data for mining.
+func (s *Stratum) PrepWork() error {
+
+	// Build final extranonce
+	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
+	if err != nil {
+		poolLog.Error("Error decoding ExtraNonce1.")
+		return err
+	}
+	poolLog.Debugf("en1 %v s.PoolWork.ExtraNonce1 %v", en1, s.PoolWork.ExtraNonce1)
+	// Work out padding
+	tmp := []string{"%0", strconv.Itoa(int(s.PoolWork.ExtraNonce2Length) * 2), "x"}
+	fmtString := strings.Join(tmp, "")
+	en2, err := hex.DecodeString(fmt.Sprintf(fmtString, s.PoolWork.ExtraNonce2))
+	if err != nil {
+		poolLog.Error("Error decoding ExtraNonce2.")
+		return err
+	}
+	poolLog.Debugf("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
+	extraNonce := append(en1[:], en2[:]...)
+	poolLog.Debugf("extraNonce %v", extraNonce)
+
+	// Increase extranonce2
+	s.PoolWork.ExtraNonce2++
+
+	// Put coinbase transaction together
+
+	cb1, err := hex.DecodeString(s.PoolWork.CB1)
+	if err != nil {
+		poolLog.Error("Error decoding Coinbase pt 1.")
+		return err
+	}
+	poolLog.Debugf("cb1 %v s.PoolWork.CB1 %v", cb1, s.PoolWork.CB1)
+
+	// I've never actually seen a cb2.
+	cb2, err := hex.DecodeString(s.PoolWork.CB2)
+	if err != nil {
+		poolLog.Error("Error decoding Coinbase pt 2.")
+		return err
+	}
+	poolLog.Debugf("cb2 %v s.PoolWork.CB2 %v", cb2, s.PoolWork.CB2)
+
+	cb := append(cb1[:], extraNonce[:]...)
+	cb = append(cb[:], cb2[:]...)
+	poolLog.Debugf("cb %v", cb)
+
+	// Calculate merkle root
+	// I have never seen a pool send anything in the merkle
+	// tree, so there is not much I can do here.
+	// Confirmed in ccminer code.
+	// Same for StakeRoot
+
+	// Generate current ntime
+	ntime := time.Now().Unix() + s.PoolWork.NtimeDelta
+
+	poolLog.Tracef("ntime: %v", ntime)
+
+	// Serialize header
+	bh := wire.BlockHeader{}
+	v, err := reverseToInt(s.PoolWork.Version)
+	if err != nil {
+		return err
+	}
+	bh.Version = v
+
+	nbits, err := hex.DecodeString(s.PoolWork.Nbits)
+	if err != nil {
+		poolLog.Error("Error decoding nbits")
+		return err
+	}
+
+	b, _ := binary.Uvarint(nbits)
+	bh.Bits = uint32(b)
+	t := time.Now().Unix() + s.PoolWork.NtimeDelta
+	bh.Timestamp = time.Unix(t, 0)
+	bh.Nonce = 0
+	// Serialized version
+	blockHeader, err := bh.Bytes()
+	if err != nil {
+		return err
+	}
+
+	target, err := hex.DecodeString(s.Target)
+	if err != nil {
+		poolLog.Error("Error decoding Target")
+		return err
+	}
+	if len(target) != 32 {
+		return fmt.Errorf("Wrong target length: got %d, expected 32", len(target))
+	}
+
+	data := blockHeader
+	poolLog.Debugf("data0 %v", data)
+	poolLog.Tracef("data len %v", len(data))
+	copy(data[31:139], cb1[0:108])
+	poolLog.Debugf("data1 %v", data)
+
+	var workdata [180]byte
+	workPosition := 0
+
+	version := new(bytes.Buffer)
+	err = binary.Write(version, binary.LittleEndian, v)
+	if err != nil {
+		return err
+	}
+	copy(workdata[workPosition:], version.Bytes())
+	poolLog.Debugf("appended version.Bytes() %v", version.Bytes())
+	poolLog.Tracef("partial workdata (version): %v", hex.EncodeToString(workdata[:]))
+
+	prevHash := revHash(s.PoolWork.Hash)
+	p, err := hex.DecodeString(prevHash)
+	if err != nil {
+		poolLog.Error("Error encoding previous hash.")
+		return err
+	}
+
+	workPosition += 4
+	copy(workdata[workPosition:], p)
+	poolLog.Tracef("partial workdata (previous hash): %v", hex.EncodeToString(workdata[:]))
+	poolLog.Debugf("prevHash %v", prevHash)
+
+	workPosition += 32
+	copy(workdata[workPosition:], cb1[0:108])
+	poolLog.Tracef("partial workdata (cb1): %v", hex.EncodeToString(workdata[:]))
+
+	workPosition += 108
+	copy(workdata[workPosition:], extraNonce)
+	poolLog.Debugf("extranonce: %v", hex.EncodeToString(extraNonce))
+	poolLog.Tracef("partial workdata (extranonce): %v", hex.EncodeToString(workdata[:]))
+
+	var randomBytes = make([]byte, 4)
+	_, err = rand.Read(randomBytes)
+	if err != nil {
+		poolLog.Errorf("Unable to generate random bytes")
+	}
+	workPosition += 4
+	copy(workdata[workPosition:], randomBytes)
+
+	poolLog.Debugf("workdata len %v", len(workdata))
+	poolLog.Tracef("workdata %v", hex.EncodeToString(workdata[:]))
+
+	var w Work
+	copy(w.Data[:], workdata[:])
+	copy(w.Target[:], target)
+	poolLog.Tracef("final data %v, target %v", hex.EncodeToString(data), hex.EncodeToString(target))
+	s.PoolWork.Work = &w
+	return nil
+
+}
+
+// PrepSubmit formats a mining.submit message from the solved work.
+func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
+	sub := Submit{}
+	sub.Method = "mining.submit"
+
+	// Format data to send off.
+
+	hexData := hex.EncodeToString(data)
+	decodedData, err := hex.DecodeString(hexData)
+	if err != nil {
+		poolLog.Error("Error decoding data.")
+		return sub, err
+	}
+
+	var submittedHeader wire.BlockHeader
+	bhBuf := bytes.NewReader(decodedData[0:wire.MaxBlockHeaderPayload])
+	err = submittedHeader.Deserialize(bhBuf)
+	if err != nil {
+		poolLog.Error("Error generating header.")
+		return sub, err
+	}
+
+	//en2 := strconv.FormatUint(s.PoolWork.ExtraNonce2, 16)
+	nonce := strconv.FormatUint(uint64(submittedHeader.Nonce), 16)
+	time := encodeTime(submittedHeader.Timestamp)
+
+	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
+	if err != nil {
+		poolLog.Error("Error decoding ExtraNonce1.")
+		//return err
+	}
+	poolLog.Tracef("en1 %v s.PoolWork.ExtraNonce1 %v", en1, s.PoolWork.ExtraNonce1)
+	// Work out padding
+	tmp := []string{"%0", strconv.Itoa(int(s.PoolWork.ExtraNonce2Length) * 2), "x"}
+	fmtString := strings.Join(tmp, "")
+	en2, err := hex.DecodeString(fmt.Sprintf(fmtString, s.PoolWork.ExtraNonce2))
+	if err != nil {
+		poolLog.Error("Error decoding ExtraNonce2.")
+		//return err
+	}
+	poolLog.Tracef("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
+	extraNonce := append(en1[:], en2[:]...)
+	poolLog.Tracef("extraNonce %v", extraNonce)
+
+	s.ID++
+	sub.ID = s.ID
+	s.submitID = s.ID
+	s.submitted = true
+
+	poolLog.Tracef("ntime %v", s.PoolWork.Ntime)
+
+	poolLog.Tracef("raw User %v JobId %v xnonce2 %v xnonce2length %v time %v nonce %v", s.User, s.PoolWork.JobID, s.PoolWork.ExtraNonce2, s.PoolWork.ExtraNonce2Length, submittedHeader.Timestamp, submittedHeader.Nonce)
+
+	poolLog.Tracef("encoded User %v JobId %v xnonce2 %v time %v nonce %v", s.User, s.PoolWork.JobID, en2, string(time), nonce)
+
+	sub.Params = []string{s.User, s.PoolWork.JobID, hex.EncodeToString(en2), s.PoolWork.Ntime, nonce}
+	// pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr
+
+	return sub, nil
+}
+
+// Various helper functions for formatting are below.
+
+func encodeTime(t time.Time) []byte {
+	buf := make([]byte, 8)
+	u := uint64(t.Unix())
+	binary.BigEndian.PutUint64(buf, u)
+	return buf
+}
+
+func reverseS(s string) (string, error) {
+	a := strings.Split(s, "")
+	sRev := ""
+	if len(a)%2 != 0 {
+		return "", fmt.Errorf("Incorrect input length")
+	}
+	for i := 0; i < len(a); i += 2 {
+		tmp := []string{a[i], a[i+1], sRev}
+		sRev = strings.Join(tmp, "")
+	}
+	return sRev, nil
+}
+
+func reverseToInt(s string) (int32, error) {
+	sRev, err := reverseS(s)
+	if err != nil {
+		return 0, err
+	}
+	i, err := strconv.ParseInt(sRev, 16, 32) // byte-reversed text is hex digits, so parse base 16
+	return int32(i), err
+}
+
+func (s *Stratum) diffToTarget(diff float64) string {
+	// diff/0 would be bad.
+	if s.Diff == 0 {
+		s.Diff = 1
+	}
+	// Also if diff wasn't set properly go with default
+	// rather than end up dividing by 0.
+	if diff == 0 {
+		diff = 1
+	}
+	diffNew := int64(diff / s.Diff)
+	_, targetHex := s.getTargetHex(diffNew)
+	return targetHex
+}
+
+// Adapted from https://github.com/sammy007/go-cryptonote-pool.git
+func (s *Stratum) getTargetHex(diff int64) (uint32, string) {
+	var Diff1 *big.Int
+	Diff1 = new(big.Int)
+	Diff1.SetString("00000000FFFF0000000000000000000000000000000000000000000000000000", 16)
+
+	padded := make([]byte, 32)
+
+	diff2 := new(big.Int)
+	diff2.SetInt64(int64(diff))
+
+	diff3 := new(big.Int)
+	diff3 = diff3.Div(Diff1, diff2)
+
+	diffBuff := diff3.Bytes()
+	copy(padded[32-len(diffBuff):], diffBuff)
+	buff := padded[0:32]
+	var target uint32
+	targetBuff := bytes.NewReader(buff)
+	binary.Read(targetBuff, binary.LittleEndian, &target)
+	targetHex := hex.EncodeToString(buff)
+
+	return target, targetHex
+}
+
+func reverse(src []byte) []byte {
+	dst := make([]byte, len(src))
+	for i := len(src); i > 0; i-- {
+		dst[len(src)-i] = src[i-1]
+	}
+	return dst
+}
+
+func revHash(hash string) string {
+	revHash := ""
+	for i := 0; i < 7; i++ {
+		j := i * 8
+		part := fmt.Sprintf("%c%c%c%c%c%c%c%c", hash[6+j], hash[7+j], hash[4+j], hash[5+j], hash[2+j], hash[3+j], hash[0+j], hash[1+j])
+		revHash += part
+	}
+	return revHash
+
+}

From c755363c50df6b9dad0fc44937abcf58166b1194 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 8 Jul 2016 09:05:23 -0400
Subject: [PATCH 008/150] Add -k option to specify cl kernel location.

---
 config.go | 3 +++
 device.go | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/config.go b/config.go
index f17dd04..95b56a4 100644
--- a/config.go
+++ b/config.go
@@ -20,6 +20,7 @@ const (
 	defaultLogLevel       = "info"
 	defaultLogDirname     = "logs"
 	defaultLogFilename    = "gominer.log"
+	defaultClKernel       = "blake256.cl"
 )
 
 var (
@@ -42,6 +43,7 @@ type config struct {
 	ConfigFile string `short:"C" long:"configfile" description:"Path to configuration file"`
 	LogDir     string `long:"logdir" description:"Directory to log output."`
 	DebugLevel string `short:"d" long:"debuglevel" description:"Logging level for all subsystems {trace, debug, info, warn, error, critical} -- You may also specify <subsystem>=<level>,<subsystem2>=<level>,... to set the log level for individual subsystems -- Use show to list available subsystems"`
+	ClKernel   string `short:"k" long:"kernel" description:"File with cl kernel to use"`
 
 	// Debugging options
 	Profile    string `long:"profile" description:"Enable HTTP profiling on given port -- NOTE port must be between 1024 and 65536"`
@@ -221,6 +223,7 @@ func loadConfig() (*config, []string, error) {
 		RPCServer:  defaultRPCServer,
 		RPCCert:    defaultRPCCertFile,
 		Intensity:  defaultIntensity,
+		ClKernel:   defaultClKernel,
 	}
 
 	// Create the home directory if it doesn't already exist.
diff --git a/device.go b/device.go
index 9a22730..57aa0de 100644
--- a/device.go
+++ b/device.go
@@ -134,7 +134,7 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 	}
 
 	// Load kernel source
-	progSrc, progSize, err := loadProgramSource("blake256.cl")
+	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
 	if err != nil {
 		return nil, fmt.Errorf("Could not load kernel source: %v", err)
 	}

From 755ae08369a410dae6dc34511e100a656c47bdb4 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Wed, 13 Jul 2016 10:33:55 -0500
Subject: [PATCH 009/150] reverse diff/hash so new target checking code works
 when solo mining (#17)

---
 device.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/device.go b/device.go
index 57aa0de..f39392d 100644
--- a/device.go
+++ b/device.go
@@ -329,15 +329,15 @@ func (d *Device) foundCandidate(nonce1 uint32, nonce0 uint32) {
 		binary.BigEndian.PutUint32(hash[i*4:], state[i])
 	}
 
-	newHash, err := chainhash.NewHashFromStr(hex.EncodeToString(hash[:]))
+	newHash, err := chainhash.NewHashFromStr(hex.EncodeToString(reverse(hash[:])))
 	if err != nil {
 		minrLog.Error(err)
 	}
 	hashNum := blockchain.ShaHashToBig(newHash)
 	target := new(big.Int)
-	target.SetString(hex.EncodeToString(d.work.Target[:]), 16)
+	target.SetString(hex.EncodeToString(reverse(d.work.Target[:])), 16)
 	if hashNum.Cmp(target) > 0 {
-		minrLog.Infof("Hash %s below target %s", hex.EncodeToString(hash[:]), hex.EncodeToString(d.work.Target[:]))
+		minrLog.Infof("Hash %s below target %s", hex.EncodeToString(reverse(hash[:])), hex.EncodeToString(reverse(d.work.Target[:])))
 
 	} else {
 		minrLog.Infof("Found hash!!  %s", hex.EncodeToString(hash[:]))

From 7931e7d31646928a992bafe85a6b05df7ba059ce Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 12 Jul 2016 16:05:52 -0400
Subject: [PATCH 010/150] Add updated blake256 cl code from cgminer.

Minor modifications of the return format were needed to work with
gominer.

Keep the old blake256 kernel as it performs better on Intel hardware but
not on others.

The improvement is mainly for Nvidia cards.  AMD cards seem to show
little difference.  When testing with an Nvidia 750 Ti the hash rate
went from ~440Mh/s to ~520Mh/s.
---
 blake256-old.cl | 1747 ++++++++++++++++++++++++++++++++++++++++++++++
 blake256.cl     | 1751 +++--------------------------------------------
 2 files changed, 1830 insertions(+), 1668 deletions(-)
 create mode 100644 blake256-old.cl

diff --git a/blake256-old.cl b/blake256-old.cl
new file mode 100644
index 0000000..ed65d90
--- /dev/null
+++ b/blake256-old.cl
@@ -0,0 +1,1747 @@
+/*    /\\ //\            BLAKE256 14-round kernel            /\\ //\    */
+/*    \// \\/          Copyright 2015  Company Zero          \// \\/    */
+/*    /\\ //\           A complete kernel re-write           /\\ //\    */
+/*    \// \\/           with inspiration  from the           \// \\/    */
+/*    /\\ //\          Golang BLAKE256 repo over at          /\\ //\    */
+/*    \// \\/           github.com/dchest/blake256           \// \\/    */
+
+#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n)))  /* rotate v right by n bits: OpenCL rotate() shifts left, so the count passed is 32-n */
+/* Sixteen 32-bit constants cst0..cstF. The search kernel below XORs them with message words (e.g. M0 ^ cst1) and seeds V8..VB, VE, VF from cst0..cst3, cst6, cst7. Presumably the BLAKE-256 constant table; verify against the specification. */
+__constant uint cst0 = 0x243F6A88UL;
+__constant uint cst1 = 0x85A308D3UL;
+__constant uint cst2 = 0x13198A2EUL;
+__constant uint cst3 = 0x03707344UL;
+__constant uint cst4 = 0xA4093822UL;
+__constant uint cst5 = 0x299F31D0UL;
+__constant uint cst6 = 0x082EFA98UL;
+__constant uint cst7 = 0xEC4E6C89UL;
+__constant uint cst8 = 0x452821E6UL;
+__constant uint cst9 = 0x38D01377UL;
+__constant uint cstA = 0xBE5466CFUL;
+__constant uint cstB = 0x34E90C6CUL;
+__constant uint cstC = 0xC0AC29B7UL;
+__constant uint cstD = 0xC97C50DDUL;
+__constant uint cstE = 0x3F84D5B5UL;
+__constant uint cstF = 0xB5470917UL;
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(
+	volatile __global uint * restrict output,
+	// Midstate
+	const uint h0,
+	const uint h1,
+	const uint h2,
+	const uint h3,
+	const uint h4,
+	const uint h5,
+	const uint h6,
+	const uint h7,
+
+	// last 52 bytes of original message
+	const uint in32,               // M[0]
+	const uint in33,               // M[1]
+	const uint in34,               // M[2]
+	// const uint in35, = nonce       M[3]
+
+	const uint in36,               // M[4]
+	const uint in37,               // M[5]
+	const uint in38,               // M[6]
+	const uint in39,               // M[7]
+
+	const uint in40,               // M[8]
+	const uint in41,               // M[9]
+	const uint in42,               // M[10]
+	const uint in43,               // M[11]
+
+	const uint in44                // M[12]
+	// in45 = padding                 M[13]
+	// in46 = padding                 M[14]
+	// in47 = padding                 M[15]
+)
+{
+	uint M0, M1, M2, M3, M4, M5, M6, M7;
+	uint M8, M9, MA, MB, MC, MD, ME, MF;
+	uint V0, V1, V2, V3, V4, V5, V6, V7;
+	uint V8, V9, VA, VB, VC, VD, VE, VF;
+	uint pre7;
+
+	/* Load the midstate and initialize */
+	V0 = h0;
+	V1 = h1;
+	V2 = h2;
+	V3 = h3;
+	V4 = h4;
+	V5 = h5;
+	V6 = h6;
+	pre7 = V7 = h7;
+	V8 = cst0;
+	V9 = cst1;
+	VA = cst2;
+	VB = cst3;
+	VC = 0xA4093D82UL;
+	VD = 0x299F3470UL;
+	VE = cst6;
+	VF = cst7;
+
+	uint nonce = get_global_id(0);
+
+	/* Load the block header and padding */
+	M0 = in32;
+	M1 = in33;
+	M2 = in34;
+	M3 = nonce;
+	M4 = in36;
+	M5 = in37;
+	M6 = in38;
+	M7 = in39;
+	M8 = in40;
+	M9 = in41;
+	MA = in42;
+	MB = in43;
+	MC = in44;
+	MD = 0x80000001UL;
+	ME = 0x00000000UL;
+	MF = 0x000005a0UL;
+
+	/* Begin processing the 64-byte block.
+	 * This can probably be optimized to
+	 * get another 10-15% performance out.
+	*/
+
+	/* Round 1. */
+	V0 = V0 + (M0 ^ cst1);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M2 ^ cst3);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M4 ^ cst5);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M6 ^ cst7);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M5 ^ cst4);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M7 ^ cst6);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M3 ^ cst2);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M1 ^ cst0);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M8 ^ cst9);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (MA ^ cstB);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (MC ^ cstD);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (ME ^ cstF);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (MD ^ cstC);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (MF ^ cstE);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (MB ^ cstA);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M9 ^ cst8);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 2. */
+	V0 = V0 + (ME ^ cstA);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M4 ^ cst8);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M9 ^ cstF);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MD ^ cst6);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (MF ^ cst9);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M6 ^ cstD);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M8 ^ cst4);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (MA ^ cstE);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M1 ^ cstC);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M0 ^ cst2);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (MB ^ cst7);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M5 ^ cst3);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M7 ^ cstB);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M3 ^ cst5);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M2 ^ cst0);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (MC ^ cst1);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 3. */
+	V0 = V0 + (MB ^ cst8);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (MC ^ cst0);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M5 ^ cst2);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MF ^ cstD);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M2 ^ cst5);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (MD ^ cstF);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M0 ^ cstC);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M8 ^ cstB);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (MA ^ cstE);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M3 ^ cst6);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M7 ^ cst1);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M9 ^ cst4);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M1 ^ cst7);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M4 ^ cst9);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M6 ^ cst3);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (ME ^ cstA);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 4. */
+	V0 = V0 + (M7 ^ cst9);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M3 ^ cst1);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (MD ^ cstC);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MB ^ cstE);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (MC ^ cstD);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (ME ^ cstB);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M1 ^ cst3);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M9 ^ cst7);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M2 ^ cst6);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M5 ^ cstA);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M4 ^ cst0);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (MF ^ cst8);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M0 ^ cst4);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M8 ^ cstF);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (MA ^ cst5);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M6 ^ cst2);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 5. */
+	V0 = V0 + (M9 ^ cst0);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M5 ^ cst7);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M2 ^ cst4);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MA ^ cstF);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M4 ^ cst2);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (MF ^ cstA);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M7 ^ cst5);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M0 ^ cst9);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (ME ^ cst1);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (MB ^ cstC);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M6 ^ cst8);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M3 ^ cstD);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M8 ^ cst6);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (MD ^ cst3);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (MC ^ cstB);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M1 ^ cstE);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 6. */
+	V0 = V0 + (M2 ^ cstC);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M6 ^ cstA);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M0 ^ cstB);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M8 ^ cst3);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (MB ^ cst0);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M3 ^ cst8);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (MA ^ cst6);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (MC ^ cst2);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M4 ^ cstD);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M7 ^ cst5);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (MF ^ cstE);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M1 ^ cst9);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (ME ^ cstF);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M9 ^ cst1);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M5 ^ cst7);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (MD ^ cst4);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 7. */
+	V0 = V0 + (MC ^ cst5);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M1 ^ cstF);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (ME ^ cstD);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M4 ^ cstA);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (MD ^ cstE);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (MA ^ cst4);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (MF ^ cst1);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M5 ^ cstC);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M0 ^ cst7);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M6 ^ cst3);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M9 ^ cst2);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M8 ^ cstB);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M2 ^ cst9);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (MB ^ cst8);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M3 ^ cst6);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M7 ^ cst0);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 8. */
+	V0 = V0 + (MD ^ cstB);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M7 ^ cstE);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (MC ^ cst1);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M3 ^ cst9);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M1 ^ cstC);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M9 ^ cst3);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (ME ^ cst7);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (MB ^ cstD);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M5 ^ cst0);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (MF ^ cst4);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M8 ^ cst6);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M2 ^ cstA);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M6 ^ cst8);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (MA ^ cst2);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M4 ^ cstF);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M0 ^ cst5);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 9. */
+	V0 = V0 + (M6 ^ cstF);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (ME ^ cst9);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (MB ^ cst3);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M0 ^ cst8);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M3 ^ cstB);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M8 ^ cst0);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M9 ^ cstE);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (MF ^ cst6);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (MC ^ cst2);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (MD ^ cst7);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M1 ^ cst4);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (MA ^ cst5);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M4 ^ cst1);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M5 ^ cstA);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M7 ^ cstD);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M2 ^ cstC);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 10. */
+	V0 = V0 + (MA ^ cst2);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M8 ^ cst4);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M7 ^ cst6);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M1 ^ cst5);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M6 ^ cst7);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M5 ^ cst1);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M4 ^ cst8);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M2 ^ cstA);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (MF ^ cstB);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M9 ^ cstE);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M3 ^ cstC);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (MD ^ cst0);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (MC ^ cst3);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M0 ^ cstD);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (ME ^ cst9);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (MB ^ cstF);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 11. */
+	V0 = V0 + (M0 ^ cst1);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M2 ^ cst3);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M4 ^ cst5);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (M6 ^ cst7);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M5 ^ cst4);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M7 ^ cst6);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M3 ^ cst2);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M1 ^ cst0);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M8 ^ cst9);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (MA ^ cstB);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (MC ^ cstD);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (ME ^ cstF);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (MD ^ cstC);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (MF ^ cstE);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (MB ^ cstA);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M9 ^ cst8);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 12. */
+	V0 = V0 + (ME ^ cstA);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M4 ^ cst8);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M9 ^ cstF);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MD ^ cst6);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (MF ^ cst9);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (M6 ^ cstD);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M8 ^ cst4);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (MA ^ cstE);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M1 ^ cstC);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M0 ^ cst2);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (MB ^ cst7);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M5 ^ cst3);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M7 ^ cstB);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M3 ^ cst5);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M2 ^ cst0);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (MC ^ cst1);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 13. */
+	V0 = V0 + (MB ^ cst8);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (MC ^ cst0);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (M5 ^ cst2);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MF ^ cstD);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (M2 ^ cst5);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (MD ^ cstF);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M0 ^ cstC);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M8 ^ cstB);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (MA ^ cstE);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M3 ^ cst6);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M7 ^ cst1);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (M9 ^ cst4);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M1 ^ cst7);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M4 ^ cst9);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (M6 ^ cst3);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (ME ^ cstA);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* Round 14. */
+	V0 = V0 + (M7 ^ cst9);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 16);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 12);
+	V1 = V1 + (M3 ^ cst1);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 16);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 12);
+	V2 = V2 + (MD ^ cstC);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 16);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 12);
+	V3 = V3 + (MB ^ cstE);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 16);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 12);
+	V2 = V2 + (MC ^ cstD);
+	V2 = V2 + V6;
+	VE = VE ^ V2;
+	VE = SPH_ROTR32(VE, 8);
+	VA = VA + VE;
+	V6 = V6 ^ VA;
+	V6 = SPH_ROTR32(V6, 7);
+	V3 = V3 + (ME ^ cstB);
+	V3 = V3 + V7;
+	VF = VF ^ V3;
+	VF = SPH_ROTR32(VF, 8);
+	VB = VB + VF;
+	V7 = V7 ^ VB;
+	V7 = SPH_ROTR32(V7, 7);
+	V1 = V1 + (M1 ^ cst3);
+	V1 = V1 + V5;
+	VD = VD ^ V1;
+	VD = SPH_ROTR32(VD, 8);
+	V9 = V9 + VD;
+	V5 = V5 ^ V9;
+	V5 = SPH_ROTR32(V5, 7);
+	V0 = V0 + (M9 ^ cst7);
+	V0 = V0 + V4;
+	VC = VC ^ V0;
+	VC = SPH_ROTR32(VC, 8);
+	V8 = V8 + VC;
+	V4 = V4 ^ V8;
+	V4 = SPH_ROTR32(V4, 7);
+	V0 = V0 + (M2 ^ cst6);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 16);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 12);
+	V1 = V1 + (M5 ^ cstA);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 16);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 12);
+	V2 = V2 + (M4 ^ cst0);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 16);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 12);
+	V3 = V3 + (MF ^ cst8);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 16);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 12);
+	V2 = V2 + (M0 ^ cst4);
+	V2 = V2 + V7;
+	VD = VD ^ V2;
+	VD = SPH_ROTR32(VD, 8);
+	V8 = V8 + VD;
+	V7 = V7 ^ V8;
+	V7 = SPH_ROTR32(V7, 7);
+	V3 = V3 + (M8 ^ cstF);
+	V3 = V3 + V4;
+	VE = VE ^ V3;
+	VE = SPH_ROTR32(VE, 8);
+	V9 = V9 + VE;
+	V4 = V4 ^ V9;
+	V4 = SPH_ROTR32(V4, 7);
+	V1 = V1 + (MA ^ cst5);
+	V1 = V1 + V6;
+	VC = VC ^ V1;
+	VC = SPH_ROTR32(VC, 8);
+	VB = VB + VC;
+	V6 = V6 ^ VB;
+	V6 = SPH_ROTR32(V6, 7);
+	V0 = V0 + (M6 ^ cst2);
+	V0 = V0 + V5;
+	VF = VF ^ V0;
+	VF = SPH_ROTR32(VF, 8);
+	VA = VA + VF;
+	V5 = V5 ^ VA;
+	V5 = SPH_ROTR32(V5, 7);
+
+	/* The final chunks of the hash
+	 * are calculated as:
+	 * h0 = h0 ^ V0 ^ V8;
+	 * h1 = h1 ^ V1 ^ V9;
+	 * h2 = h2 ^ V2 ^ VA;
+	 * h3 = h3 ^ V3 ^ VB;
+	 * h4 = h4 ^ V4 ^ VC;
+	 * h5 = h5 ^ V5 ^ VD;
+	 * h6 = h6 ^ V6 ^ VE;
+	 * h7 = h7 ^ V7 ^ VF;
+	 *
+	 * We just check if the last 32-bit
+	 * word (h7) is zero and if it is,
+	 * we tell cgminer that we've found
+	 * a share and to check it against
+	 * the target.
+	*/
+
+	/* Debug code to help you assess the correctness
+	 * of your hashing function in case someone decides
+	 * to try to optimize.
+	if (!((pre7 ^ V7 ^ VF) & 0xFFFF0000)) {
+		printf("hash on gpu %x %x %x %x %x %x %x %x\n",
+			h0 ^ V0 ^ V8,
+			h1 ^ V1 ^ V9,
+			h2 ^ V2 ^ VA,
+			h3 ^ V3 ^ VB,
+			h4 ^ V4 ^ VC,
+			h5 ^ V5 ^ VD,
+			h6 ^ V6 ^ VE,
+			h7 ^ V7 ^ VF);
+		printf("nonce for hash on gpu %x\n",
+			nonce);
+	}
+	*/
+
+	if (pre7 ^ V7 ^ VF) return;
+
+	/* Push this share */
+	output[++output[0]] = nonce;
+}
diff --git a/blake256.cl b/blake256.cl
index ed65d90..0b3c7d8 100644
--- a/blake256.cl
+++ b/blake256.cl
@@ -1,28 +1,30 @@
-/*    /\\ //\            BLAKE256 14-round kernel            /\\ //\    */
-/*    \// \\/          Copyright 2015  Company Zero          \// \\/    */
-/*    /\\ //\           A complete kernel re-write           /\\ //\    */
-/*    \// \\/           with inspiration  from the           \// \\/    */
-/*    /\\ //\          Golang BLAKE256 repo over at          /\\ //\    */
-/*    \// \\/           github.com/dchest/blake256           \// \\/    */
-
-#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n)))
-
-__constant uint cst0 = 0x243F6A88UL;
-__constant uint cst1 = 0x85A308D3UL;
-__constant uint cst2 = 0x13198A2EUL;
-__constant uint cst3 = 0x03707344UL;
-__constant uint cst4 = 0xA4093822UL;
-__constant uint cst5 = 0x299F31D0UL;
-__constant uint cst6 = 0x082EFA98UL;
-__constant uint cst7 = 0xEC4E6C89UL;
-__constant uint cst8 = 0x452821E6UL;
-__constant uint cst9 = 0x38D01377UL;
-__constant uint cstA = 0xBE5466CFUL;
-__constant uint cstB = 0x34E90C6CUL;
-__constant uint cstC = 0xC0AC29B7UL;
-__constant uint cstD = 0xC97C50DDUL;
-__constant uint cstE = 0x3F84D5B5UL;
-__constant uint cstF = 0xB5470917UL;
+/**
+ * BLAKE256 14-round kernel
+ *
+ * Copyright 2015 Company Zero
+ * A complete kernel re-write
+ * with inspiration from the Golang BLAKE256 repo (github.com/dchest/blake256)
+ */
+
+/**
+ * optimized by tpruvot 02/2016 :
+ *
+ * GTX 960 | (5s):735.3M (avg):789.3Mh/s
+ * GTX 750 | (5s):443.3M (avg):476.8Mh/s
+ * to
+ * GTX 960 | (5s):875.0M (avg):899.2Mh/s
+ * GTX 750 | (5s):523.1M (avg):536.8Mh/s
+ */
+#define ROTR(v,n) rotate(v,(uint)(32U-n))
+#define ROTL(v,n) rotate(v, n)
+
+#ifdef _AMD_OPENCL
+#define SWAP(v)   rotate(v, 16U)
+#define ROTR8(v)  rotate(v, 24U)
+#else
+#define SWAP(v)  as_uint(as_uchar4(v).zwxy)
+#define ROTR8(v) as_uint(as_uchar4(v).yzwx)
+#endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void search(
@@ -37,30 +39,45 @@ __kernel void search(
 	const uint h6,
 	const uint h7,
 
-	// last 52 bytes of original message
-	const uint in32,               // M[0]
-	const uint in33,               // M[1]
-	const uint in34,               // M[2]
-	// const uint in35, = nonce       M[3]
-
-	const uint in36,               // M[4]
-	const uint in37,               // M[5]
-	const uint in38,               // M[6]
-	const uint in39,               // M[7]
-
-	const uint in40,               // M[8]
-	const uint in41,               // M[9]
-	const uint in42,               // M[10]
-	const uint in43,               // M[11]
-
-	const uint in44                // M[12]
-	// in45 = padding                 M[13]
-	// in46 = padding                 M[14]
-	// in47 = padding                 M[15]
+	// last 52 bytes of data
+	const uint M0,
+	const uint M1,
+	const uint M2,
+	// const uint M3 : nonce
+	const uint M4,
+	const uint M5,
+	const uint M6,
+	const uint M7,
+	const uint M8,
+	const uint M9,
+	const uint MA,
+	const uint MB,
+	const uint MC
 )
 {
-	uint M0, M1, M2, M3, M4, M5, M6, M7;
-	uint M8, M9, MA, MB, MC, MD, ME, MF;
+	/* Load the block header and padding */
+	const uint M3 = get_global_id(0);
+	const uint MD = 0x80000001UL;
+	const uint ME = 0x00000000UL;
+	const uint MF = 0x000005a0UL;
+
+	const uint cst0 = 0x243F6A88UL;
+	const uint cst1 = 0x85A308D3UL;
+	const uint cst2 = 0x13198A2EUL;
+	const uint cst3 = 0x03707344UL;
+	const uint cst4 = 0xA4093822UL;
+	const uint cst5 = 0x299F31D0UL;
+	const uint cst6 = 0x082EFA98UL;
+	const uint cst7 = 0xEC4E6C89UL;
+	const uint cst8 = 0x452821E6UL;
+	const uint cst9 = 0x38D01377UL;
+	const uint cstA = 0xBE5466CFUL;
+	const uint cstB = 0x34E90C6CUL;
+	const uint cstC = 0xC0AC29B7UL;
+	const uint cstD = 0xC97C50DDUL;
+	const uint cstE = 0x3F84D5B5UL;
+	const uint cstF = 0xB5470917UL;
+
 	uint V0, V1, V2, V3, V4, V5, V6, V7;
 	uint V8, V9, VA, VB, VC, VD, VE, VF;
 	uint pre7;
@@ -74,6 +91,7 @@ __kernel void search(
 	V5 = h5;
 	V6 = h6;
 	pre7 = V7 = h7;
+
 	V8 = cst0;
 	V9 = cst1;
 	VA = cst2;
@@ -83,1626 +101,22 @@ __kernel void search(
 	VE = cst6;
 	VF = cst7;
 
-	uint nonce = get_global_id(0);
-
-	/* Load the block header and padding */
-	M0 = in32;
-	M1 = in33;
-	M2 = in34;
-	M3 = nonce;
-	M4 = in36;
-	M5 = in37;
-	M6 = in38;
-	M7 = in39;
-	M8 = in40;
-	M9 = in41;
-	MA = in42;
-	MB = in43;
-	MC = in44;
-	MD = 0x80000001UL;
-	ME = 0x00000000UL;
-	MF = 0x000005a0UL;
-
-	/* Begin the doing the 64-byte block.
-	 * This can probably be optimized to
-	 * get another 10-15% performance out.
-	*/
-
-	/* Round 1. */
-	V0 = V0 + (M0 ^ cst1);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M2 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M4 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M6 ^ cst7);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M5 ^ cst4);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M7 ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M3 ^ cst2);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M1 ^ cst0);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M8 ^ cst9);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MA ^ cstB);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (ME ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MF ^ cstE);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MB ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M9 ^ cst8);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 2. */
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M4 ^ cst8);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M9 ^ cstF);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MD ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MF ^ cst9);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M6 ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M8 ^ cst4);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M1 ^ cstC);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M0 ^ cst2);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MB ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M5 ^ cst3);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M7 ^ cstB);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M3 ^ cst5);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M2 ^ cst0);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MC ^ cst1);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 3. */
-	V0 = V0 + (MB ^ cst8);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (MC ^ cst0);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M5 ^ cst2);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MF ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M2 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MD ^ cstF);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M0 ^ cstC);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M8 ^ cstB);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M3 ^ cst6);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M7 ^ cst1);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M9 ^ cst4);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M1 ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M4 ^ cst9);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M6 ^ cst3);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 4. */
-	V0 = V0 + (M7 ^ cst9);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M3 ^ cst1);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MB ^ cstE);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (ME ^ cstB);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M1 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M9 ^ cst7);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M2 ^ cst6);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M5 ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M4 ^ cst0);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MF ^ cst8);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M0 ^ cst4);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M8 ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MA ^ cst5);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M6 ^ cst2);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 5. */
-	V0 = V0 + (M9 ^ cst0);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M5 ^ cst7);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M2 ^ cst4);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MA ^ cstF);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M4 ^ cst2);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MF ^ cstA);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M7 ^ cst5);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M0 ^ cst9);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (ME ^ cst1);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MB ^ cstC);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M6 ^ cst8);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M3 ^ cstD);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M8 ^ cst6);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MD ^ cst3);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MC ^ cstB);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M1 ^ cstE);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 6. */
-	V0 = V0 + (M2 ^ cstC);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M6 ^ cstA);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M0 ^ cstB);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M8 ^ cst3);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MB ^ cst0);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M3 ^ cst8);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (MA ^ cst6);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MC ^ cst2);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M4 ^ cstD);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M7 ^ cst5);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MF ^ cstE);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M1 ^ cst9);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (ME ^ cstF);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M9 ^ cst1);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M5 ^ cst7);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MD ^ cst4);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 7. */
-	V0 = V0 + (MC ^ cst5);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M1 ^ cstF);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (ME ^ cstD);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M4 ^ cstA);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MD ^ cstE);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MA ^ cst4);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (MF ^ cst1);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M5 ^ cstC);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M0 ^ cst7);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M6 ^ cst3);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M9 ^ cst2);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M8 ^ cstB);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M2 ^ cst9);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MB ^ cst8);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M3 ^ cst6);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M7 ^ cst0);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 8. */
-	V0 = V0 + (MD ^ cstB);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M7 ^ cstE);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MC ^ cst1);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M3 ^ cst9);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M1 ^ cstC);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M9 ^ cst3);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (ME ^ cst7);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MB ^ cstD);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M5 ^ cst0);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MF ^ cst4);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M8 ^ cst6);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M2 ^ cstA);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M6 ^ cst8);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MA ^ cst2);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M4 ^ cstF);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M0 ^ cst5);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 9. */
-	V0 = V0 + (M6 ^ cstF);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (ME ^ cst9);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MB ^ cst3);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M0 ^ cst8);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M3 ^ cstB);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M8 ^ cst0);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M9 ^ cstE);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MF ^ cst6);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MC ^ cst2);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MD ^ cst7);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M1 ^ cst4);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MA ^ cst5);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M4 ^ cst1);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M5 ^ cstA);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M7 ^ cstD);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M2 ^ cstC);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 10. */
-	V0 = V0 + (MA ^ cst2);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M8 ^ cst4);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M7 ^ cst6);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M1 ^ cst5);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M6 ^ cst7);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M5 ^ cst1);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M4 ^ cst8);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M2 ^ cstA);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MF ^ cstB);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M9 ^ cstE);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M3 ^ cstC);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MD ^ cst0);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (MC ^ cst3);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M0 ^ cstD);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (ME ^ cst9);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MB ^ cstF);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 11. */
-	V0 = V0 + (M0 ^ cst1);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M2 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M4 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M6 ^ cst7);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M5 ^ cst4);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M7 ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M3 ^ cst2);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M1 ^ cst0);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M8 ^ cst9);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MA ^ cstB);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (ME ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MF ^ cstE);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MB ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M9 ^ cst8);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 12. */
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M4 ^ cst8);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M9 ^ cstF);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MD ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MF ^ cst9);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M6 ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M8 ^ cst4);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M1 ^ cstC);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M0 ^ cst2);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MB ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M5 ^ cst3);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M7 ^ cstB);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M3 ^ cst5);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M2 ^ cst0);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MC ^ cst1);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 13. */
-	V0 = V0 + (MB ^ cst8);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (MC ^ cst0);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M5 ^ cst2);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MF ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M2 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MD ^ cstF);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M0 ^ cstC);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M8 ^ cstB);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M3 ^ cst6);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M7 ^ cst1);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M9 ^ cst4);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M1 ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M4 ^ cst9);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M6 ^ cst3);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 14. */
-	V0 = V0 + (M7 ^ cst9);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M3 ^ cst1);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MB ^ cstE);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (ME ^ cstB);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M1 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M9 ^ cst7);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M2 ^ cst6);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M5 ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M4 ^ cst0);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MF ^ cst8);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M0 ^ cst4);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M8 ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MA ^ cst5);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M6 ^ cst2);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
+	/* 14 rounds */
+
+	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M7 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M3 ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MB ^ cstE); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (ME ^ cstB); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M1 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M9 ^ cst7); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M2 ^ cst6); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M5 ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M4 ^ cst0); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MF ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M0 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M8 ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MA ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M6 ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M9 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M5 ^ cst7); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M2 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MA ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M4 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MF ^ cstA); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M7 ^ cst5); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M0 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (ME ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MB ^ cstC); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M6 ^ cst8); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M3 ^ cstD); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M8 ^ cst6); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MD ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MC ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M1 ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M2 ^ cstC); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M6 ^ cstA); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M0 ^ cstB); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M8 ^ cst3); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MB ^ cst0); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M3 ^ cst8); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (MA ^ cst6); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MC ^ cst2); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M4 ^ cstD); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M7 ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MF ^ cstE); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M1 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (ME ^ cstF); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M9 ^ cst1); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M5 ^ cst7); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MD ^ cst4); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MC ^ cst5); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M1 ^ cstF); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (ME ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M4 ^ cstA); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MD ^ cstE); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MA ^ cst4); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (MF ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M5 ^ cstC); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M0 ^ cst7); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M9 ^ cst2); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M8 ^ cstB); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M2 ^ cst9); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MB ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M7 ^ cst0); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MD ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M7 ^ cstE); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MC ^ cst1); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M3 ^ cst9); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M1 ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M9 ^ cst3); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (ME ^ cst7); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MB ^ cstD); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M5 ^ cst0); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MF ^ cst4); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M8 ^ cst6); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M2 ^ cstA); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M6 ^ cst8); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MA ^ cst2); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M4 ^ cstF); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M0 ^ cst5); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M6 ^ cstF); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (ME ^ cst9); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MB ^ cst3); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M0 ^ cst8); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M3 ^ cstB); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M8 ^ cst0); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M9 ^ cstE); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MF ^ cst6); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MC ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MD ^ cst7); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M1 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MA ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M4 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M5 ^ cstA); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M7 ^ cstD); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M2 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MA ^ cst2); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M7 ^ cst6); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M1 ^ cst5); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M6 ^ cst7); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M5 ^ cst1); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M2 ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MF ^ cstB); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M9 ^ cstE); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M3 ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MD ^ cst0); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MC ^ cst3); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M0 ^ cstD); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (ME ^ cst9); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MB ^ cstF); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
+	V0 = V0 + (M7 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M3 ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MB ^ cstE); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (ME ^ cstB); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M1 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M9 ^ cst7); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M2 ^ cst6); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M5 ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M4 ^ cst0); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MF ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M0 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M8 ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MA ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M6 ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF);/*VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);*/
 
 	/* The final chunks of the hash
 	 * are calculated as:
@@ -1743,5 +157,6 @@ __kernel void search(
 	if (pre7 ^ V7 ^ VF) return;
 
 	/* Push this share */
-	output[++output[0]] = nonce;
+	//output[output[0xFF]++] = M3;
+	output[++output[0]] = M3;
 }

From 236c1b3bfdb9146df2f24c9c9964ec5766e1e456 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Tue, 19 Jul 2016 06:53:06 -0500
Subject: [PATCH 011/150] improve compatibility with sgminer and send share
 acceptance messages (#18)

---
 notify/notify.go | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/notify/notify.go b/notify/notify.go
index 2cfc6d6..f0ca7df 100644
--- a/notify/notify.go
+++ b/notify/notify.go
@@ -8,8 +8,10 @@
 package main
 
 import (
+	"bufio"
 	"fmt"
 	"net"
+	"strings"
 )
 
 func main() {
@@ -28,25 +30,35 @@ func main() {
 }
 
 func handleConnection(c net.Conn) {
-	msg1 := `{"id":1,"result":[[["mining.set_difficulty","deadbeefcafebabecc7e1c0000000000"],["mining.notify","deadbeefcafebabecc7e1c0000000000"]],"00000000000000000fe43fbb",12],"error":null}`
-	msg2 := `{"id":null,"method":"mining.set_difficulty","params":[8]}`
-	msg3 := `{"id":null,"method":"mining.notify","params":["bb3b","6ea8e28a4b172946d743eb3382785120990fe73c247e3dbd000004fc00000000","b25bc74bba24acd4729e61c9f4c53e4f457dc3082d2d28355ae6e6df65e54b4a2040ba54288130410bbcfc548b13711b039bc89f17b3bacb6532bd9001e183f101005c141421c2b80400000097a50000d9f8171ad0357a0f0100000069a3000081460000dbca76570000000000000000","",[],"01000000","1a17f8d9","5776cadb",true]}`
+	msg1 := `{"id":1,"result":[[["mining.set_difficulty","1"],["mining.notify","2bd595e34826a3b6271400920d4decb8"]],"0000000000000000e3014335",12],"error":null}`
+	msg2 := `{"id":2,"result":true,"error":null}`
+	msg3 := `{"id":null,"method":"mining.set_difficulty","params":[1]}`
+	msg4 := `{"id":3,"result":true,"error":null}`
+	msg5 := `{"id":null,"method":"mining.notify","params":["76df","7c3b9a506a98f865820e4c46aaa65cec37f18cf1bf7c508700000ac200000000","a455f69725e9c8623baa3c9c5a708aefb947702dc2b620b4c10129977e104c0275571a5ca5b1308b075fe74224504c9e6b1153f3de97235e7a8c7e58ea8f1c55010086a1d41fb3ee05000000fda400004a33121a2db33e1101000000abae0000260800008ec783570000000000000000","",[],"01000000","1a12334a","5783c78e",true]}`
 	// WorkData generated from that should be:
-	// 010000008ae2a86e4629174b33eb43d7205178823ce70f99bd3d7e24fc04000000000000b25bc74bba24acd4729e61c9f4c53e4f457dc3082d2d28355ae6e6df65e54b4a2040ba54288130410bbcfc548b13711b039bc89f17b3bacb6532bd9001e183f101005c141421c2b80400000097a50000d9f8171ad0357a0f0100000069a3000081460000dbca7657000000000000000000188fec0fe43fbb000000000000000000000000000000000000000000000000
+	// 010000008ae2a86e4629174b33eb43d7205178823ce70f99bd3d7e24fc04000000000000b25bc74bba24acd4729e61c9f4c53e4f457dc3082d2d28355ae6e6df65e54b4a2040ba54288130410bbcfc548b13711b039bc89f17b3bacb6532bd9001e183f101005c141421c2b80400000097a50000d9f8171ad0357a0f0100000069a3000081460000dbca7657000000000000000000f808120fe43fbb000000000000000000000000000000000000000000000000
+	msg6 := `{"id":4,"result":true,"error":null}`
 
-	buf := make([]byte, 1024)
-	_, err := c.Read(buf)
-	if err != nil {
-		fmt.Println("Error reading:", err.Error())
-	}
+	reader := bufio.NewReader(c)
 
-	fmt.Println(string(buf))
-
-	send("subscribe reply", []byte(msg1), c)
-	send("difficulty", []byte(msg2), c)
-	send("notify", []byte(msg3), c)
+	for {
+		buf, err := reader.ReadBytes('\n')
+		if err != nil {
+			c.Close()
+			return
+		}
+		fmt.Println("Received " + string(buf))
 
-	//c.Close()
+		if strings.Contains(string(buf), "mining.submit") {
+			send("mining.submit reply", []byte(msg6), c)
+		} else {
+			send("subscribe reply", []byte(msg1), c)
+			send("authorize reply", []byte(msg2), c)
+			send("difficulty", []byte(msg3), c)
+			send("mining.extranonce.subscribe", []byte(msg4), c)
+			send("notify", []byte(msg5), c)
+		}
+	}
 }
 
 func send(mType string, m []byte, c net.Conn) {

From 398d325212009cf2d16b45d67fdd1a2c38694eb0 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 19 Jul 2016 15:19:26 -0400
Subject: [PATCH 012/150] Count correctly when creating lastBlock

---
 device.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/device.go b/device.go
index f39392d..0d63402 100644
--- a/device.go
+++ b/device.go
@@ -221,7 +221,7 @@ func (d *Device) updateCurrentWork() {
 
 	// Convert the next block to uint32 array.
 	for i := 0; i < 16; i++ {
-		d.lastBlock[i] = binary.BigEndian.Uint32(d.work.Data[128+i*4:])
+		d.lastBlock[i] = binary.BigEndian.Uint32(d.work.Data[128+i*4 : 132+i*4])
 	}
 }
 

From 4165a815dae2b3d071dbfda8ff528f8050829933 Mon Sep 17 00:00:00 2001
From: jolan <jolan@jolan.org>
Date: Mon, 18 Jul 2016 10:19:38 -0500
Subject: [PATCH 013/150] Fix bugs in stratum pool mining.

This includes improvements to the testserver and some test code with
data taken from sgminer.

Code by jcv, jolan, and cj
---
 device.go  |  72 +++++++++++++++++++++++++++----------
 getwork.go |   5 ++-
 stratum.go | 102 +++++++++++++++++++++++------------------------------
 3 files changed, 101 insertions(+), 78 deletions(-)

diff --git a/device.go b/device.go
index 0d63402..2b66554 100644
--- a/device.go
+++ b/device.go
@@ -55,7 +55,8 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 
 type Work struct {
 	Data   [192]byte
-	Target [32]byte
+	Target *big.Int
+	Nonce2 uint32
 }
 
 type Device struct {
@@ -208,30 +209,66 @@ func (d *Device) updateCurrentWork() {
 	d.hasWork = true
 
 	d.work = *w
-
+	minrLog.Tracef("pre-nonce: %v", hex.EncodeToString(d.work.Data[:]))
 	// Set nonce2
-	binary.BigEndian.PutUint32(d.work.Data[128+4*nonce2Word:], uint32(d.index))
-
+	binary.LittleEndian.PutUint32(d.work.Data[124+4*nonce2Word:], d.work.Nonce2)
 	// Reset the hash state
 	copy(d.midstate[:], blake256.IV256[:])
 
 	// Hash the two first blocks
 	blake256.Block(d.midstate[:], d.work.Data[0:64], 512)
 	blake256.Block(d.midstate[:], d.work.Data[64:128], 1024)
+	minrLog.Tracef("midstate input data %v", hex.EncodeToString(d.work.Data[0:128]))
 
 	// Convert the next block to uint32 array.
 	for i := 0; i < 16; i++ {
 		d.lastBlock[i] = binary.BigEndian.Uint32(d.work.Data[128+i*4 : 132+i*4])
+		//minrLog.Tracef("lastblockin %v: %v", i, d.lastBlock[i])
 	}
+	minrLog.Tracef("data: %v", hex.EncodeToString(d.work.Data[:]))
 }
 
 func (d *Device) Run() {
+	//d.testFoundCandidate()
+	//return
 	err := d.runDevice()
 	if err != nil {
 		minrLog.Errorf("Error on device: %v", err)
 	}
 }
 
+// testFoundCandidate has some hardcoded data to match up with sgminer.
+func (d *Device) testFoundCandidate() {
+	n1 := uint32(33554432)
+	n0 := uint32(7245027)
+
+	d.midstate[0] = uint32(2421507776)
+	d.midstate[1] = uint32(2099684366)
+	d.midstate[2] = uint32(8033620)
+	d.midstate[3] = uint32(950943511)
+	d.midstate[4] = uint32(2489053653)
+	d.midstate[5] = uint32(3357747798)
+	d.midstate[6] = uint32(2534384973)
+	d.midstate[7] = uint32(2947973092)
+
+	target, _ := hex.DecodeString("00000000ffff0000000000000000000000000000000000000000000000000000")
+	bigTarget := new(big.Int)
+	bigTarget.SetString(hex.EncodeToString(target), 16)
+	d.work.Target = bigTarget
+
+	data, _ := hex.DecodeString("01000000509a3b7c65f8986a464c0e82ec5ca6aaf18cf13787507cbfc20a000000000000a455f69725e9c8623baa3c9c5a708aefb947702dc2b620b4c10129977e104c0275571a5ca5b1308b075fe74224504c9e6b1153f3de97235e7a8c7e58ea8f1c55010086a1d41fb3ee05000000fda400004a33121a2db33e1101000000abae0000260800008ec78357000000000000000000a461f2e3014335000000000000000000000000000000000000000000000000000000000000000000000000")
+	copy(d.work.Data[:], data)
+
+	minrLog.Errorf("data: %v", d.work.Data)
+	minrLog.Errorf("target: %v", d.work.Target)
+	minrLog.Errorf("nonce1 %x, nonce0: %x", n1, n0)
+
+	d.foundCandidate(n1, n0)
+	//need to match
+	//00000000df6ffb6059643a9215f95751baa7b1ed8aa93edfeb9a560ecb1d5884
+	//stratum submit {"params": ["test", "76df", "0200000000a461f2e3014335", "5783c78e", "e38c6e00"], "id": 4, "method": "mining.submit"}
+}
+
 func (d *Device) runDevice() error {
 	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
@@ -248,7 +285,11 @@ func (d *Device) runDevice() error {
 		}
 
 		// Increment nonce1
-		d.lastBlock[nonce1Word]++
+		//d.lastBlock[nonce1Word]++
+		d.work.Nonce2++
+		var tmpBytes = make([]byte, 4)
+		binary.LittleEndian.PutUint32(tmpBytes, d.work.Nonce2)
+		d.lastBlock[nonce1Word] = binary.BigEndian.Uint32(tmpBytes)
 
 		// arg 0: pointer to the buffer
 		obuf := d.outputBuffer
@@ -259,6 +300,7 @@ func (d *Device) runDevice() error {
 
 		// args 1..8: midstate
 		for i := 0; i < 8; i++ {
+			//minrLog.Tracef("mid: %v: %v", i+1, d.midstate[i])
 			ms := d.midstate[i]
 			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1), uint32Size, unsafe.Pointer(&ms))
 			if status != cl.CL_SUCCESS {
@@ -273,6 +315,7 @@ func (d *Device) runDevice() error {
 				i2++
 			}
 			lb := d.lastBlock[i2]
+			//minrLog.Tracef("lastblockused: %v: %v", i+9, lb)
 			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9), uint32Size, unsafe.Pointer(&lb))
 			if status != cl.CL_SUCCESS {
 				return clError(status, "CLSetKernelArg")
@@ -318,26 +361,17 @@ func (d *Device) foundCandidate(nonce1 uint32, nonce0 uint32) {
 	copy(data, d.work.Data[:])
 	binary.BigEndian.PutUint32(data[128+4*nonce1Word:], nonce1)
 	binary.BigEndian.PutUint32(data[128+4*nonce0Word:], nonce0)
-
-	// Perform the final hash block to get the hash
-	var state [8]uint32
-	copy(state[:], d.midstate[:])
-	blake256.Block(state[:], data[128:192], 1440)
-
-	var hash [32]byte
-	for i := 0; i < 8; i++ {
-		binary.BigEndian.PutUint32(hash[i*4:], state[i])
-	}
+	hash := chainhash.HashFuncB(data[0:180])
 
 	newHash, err := chainhash.NewHashFromStr(hex.EncodeToString(reverse(hash[:])))
 	if err != nil {
 		minrLog.Error(err)
 	}
+	minrLog.Errorf("hash: %x", hash)
+	minrLog.Errorf("newHash: %v", newHash)
 	hashNum := blockchain.ShaHashToBig(newHash)
-	target := new(big.Int)
-	target.SetString(hex.EncodeToString(reverse(d.work.Target[:])), 16)
-	if hashNum.Cmp(target) > 0 {
-		minrLog.Infof("Hash %s below target %s", hex.EncodeToString(reverse(hash[:])), hex.EncodeToString(reverse(d.work.Target[:])))
+	if hashNum.Cmp(d.work.Target) > 0 {
+		minrLog.Infof("Hash %s below target %s", hex.EncodeToString(reverse(hash[:])), d.work.Target)
 
 	} else {
 		minrLog.Infof("Found hash!!  %s", hex.EncodeToString(hash[:]))
diff --git a/getwork.go b/getwork.go
index 440fb1a..153168d 100644
--- a/getwork.go
+++ b/getwork.go
@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"math/big"
 	"net"
 	"net/http"
 	"strconv"
@@ -174,10 +175,12 @@ func GetWork() (*Work, error) {
 	if len(target) != 32 {
 		return nil, fmt.Errorf("Wrong target length: got %d, expected 32", len(target))
 	}
+	bigTarget := new(big.Int)
+	bigTarget.SetString(hex.EncodeToString(target), 16)
 
 	var w Work
 	copy(w.Data[:], data)
-	copy(w.Target[:], target)
+	w.Target = bigTarget
 	return &w, nil
 }
 
diff --git a/stratum.go b/stratum.go
index f0a879d..bd0819e 100644
--- a/stratum.go
+++ b/stratum.go
@@ -21,9 +21,12 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 
+	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/wire"
 )
 
+var chainParams = &chaincfg.MainNetParams
+
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
 type Stratum struct {
@@ -37,7 +40,7 @@ type Stratum struct {
 	subID     uint64
 	submitID  uint64
 	Diff      float64
-	Target    string
+	Target    *big.Int
 	submitted bool
 	PoolWork  NotifyWork
 }
@@ -49,6 +52,7 @@ type NotifyWork struct {
 	ExtraNonce1       string
 	ExtraNonce2       uint64
 	ExtraNonce2Length float64
+	Nonce2            uint32
 	CB1               string
 	CB2               string
 	Height            int64
@@ -115,9 +119,9 @@ type NotifyRes struct {
 
 // Submit models a submission message.
 type Submit struct {
-	Method string      `json:"method"`
 	Params []string    `json:"params"`
 	ID     interface{} `json:"id"`
+	Method string      `json:"method"`
 }
 
 // errJsonType is an error for json that we do not expect.
@@ -150,7 +154,7 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 	stratum.authID = 2
 	// Target for share is 1 unless we hear otherwise.
 	stratum.Diff = 1
-	stratum.Target = stratum.diffToTarget(stratum.Diff)
+	stratum.Target = diffToTarget(stratum.Diff)
 	stratum.PoolWork.NewWork = false
 	stratum.Reader = bufio.NewReader(stratum.Conn)
 	go stratum.Listen()
@@ -622,7 +626,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if !ok {
 			return nil, errJsonType
 		}
-		s.Target = s.diffToTarget(difficulty)
+		s.Target = diffToTarget(difficulty)
 		s.Diff = difficulty
 		var nres = StratumMsg{}
 		nres.Method = method
@@ -724,9 +728,9 @@ func (s *Stratum) PrepWork() error {
 		poolLog.Error("Error decoding ExtraNonce2.")
 		return err
 	}
-	poolLog.Debugf("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
+	poolLog.Tracef("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
 	extraNonce := append(en1[:], en2[:]...)
-	poolLog.Debugf("extraNonce %v", extraNonce)
+	poolLog.Tracef("extraNonce %v", extraNonce)
 
 	// Increase extranonce2
 	s.PoolWork.ExtraNonce2++
@@ -788,15 +792,6 @@ func (s *Stratum) PrepWork() error {
 		return err
 	}
 
-	target, err := hex.DecodeString(s.Target)
-	if err != nil {
-		poolLog.Error("Error decoding Target")
-		return err
-	}
-	if len(target) != 32 {
-		return fmt.Errorf("Wrong target length: got %d, expected 32", len(target))
-	}
-
 	data := blockHeader
 	poolLog.Debugf("data0 %v", data)
 	poolLog.Tracef("data len %v", len(data))
@@ -842,15 +837,34 @@ func (s *Stratum) PrepWork() error {
 		poolLog.Errorf("Unable to generate random bytes")
 	}
 	workPosition += 4
+	// XXX would be nice to enable a static 'random' number here for tests
+	//binary.LittleEndian.PutUint32(randomBytes, 4066485248)
+	poolLog.Tracef("Random data: %v at: %v", randomBytes, workPosition)
 	copy(workdata[workPosition:], randomBytes)
 
 	poolLog.Debugf("workdata len %v", len(workdata))
 	poolLog.Tracef("workdata %v", hex.EncodeToString(workdata[:]))
 
 	var w Work
+	/*var empty = []byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+	}
+	copy(w.Data[:], empty[:])*/
 	copy(w.Data[:], workdata[:])
-	copy(w.Target[:], target)
-	poolLog.Tracef("final data %v, target %v", hex.EncodeToString(data), hex.EncodeToString(target))
+	w.Target = s.Target
+	w.Nonce2 = s.PoolWork.Nonce2
+	poolLog.Tracef("final data %v, target %v", hex.EncodeToString(w.Data[:]), w.Target)
 	s.PoolWork.Work = &w
 	return nil
 
@@ -896,9 +910,9 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 		poolLog.Error("Error decoding ExtraNonce2.")
 		//return err
 	}
-	poolLog.Tracef("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
+	poolLog.Errorf("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
 	extraNonce := append(en1[:], en2[:]...)
-	poolLog.Tracef("extraNonce %v", extraNonce)
+	poolLog.Errorf("extraNonce %v", extraNonce)
 
 	s.ID++
 	sub.ID = s.ID
@@ -909,9 +923,11 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 
 	poolLog.Tracef("raw User %v JobId %v xnonce2 %v xnonce2length %v time %v nonce %v", s.User, s.PoolWork.JobID, s.PoolWork.ExtraNonce2, s.PoolWork.ExtraNonce2Length, submittedHeader.Timestamp, submittedHeader.Nonce)
 
-	poolLog.Tracef("encoded User %v JobId %v xnonce2 %v time %v nonce %v", s.User, s.PoolWork.JobID, en2, string(time), nonce)
+	xnonce2str := hex.EncodeToString(data[144:156])
+
+	poolLog.Tracef("encoded User %v JobId %v xnonce2 %v time %v nonce %v", s.User, s.PoolWork.JobID, xnonce2str, string(time), nonce)
 
-	sub.Params = []string{s.User, s.PoolWork.JobID, hex.EncodeToString(en2), s.PoolWork.Ntime, nonce}
+	sub.Params = []string{s.User, s.PoolWork.JobID, xnonce2str, s.PoolWork.Ntime, nonce}
 	// pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr
 
 	return sub, nil
@@ -948,44 +964,14 @@ func reverseToInt(s string) (int32, error) {
 	return int32(i), err
 }
 
-func (s *Stratum) diffToTarget(diff float64) string {
-	// diff/0 would be bad.
-	if s.Diff == 0 {
-		s.Diff = 1
-	}
-	// Also if diff wasn't set properly go with default
-	// rather then end if div by 0.
-	if diff == 0 {
-		diff = 1
-	}
-	diffNew := int64(diff / s.Diff)
-	_, targetHex := s.getTargetHex(diffNew)
-	return targetHex
-}
-
-// Adapted from https://github.com/sammy007/go-cryptonote-pool.git
-func (s *Stratum) getTargetHex(diff int64) (uint32, string) {
-	var Diff1 *big.Int
-	Diff1 = new(big.Int)
-	Diff1.SetString("00000000FFFF0000000000000000000000000000000000000000000000000000", 16)
-
-	padded := make([]byte, 32)
-
-	diff2 := new(big.Int)
-	diff2.SetInt64(int64(diff))
-
-	diff3 := new(big.Int)
-	diff3 = diff3.Div(Diff1, diff2)
-
-	diffBuff := diff3.Bytes()
-	copy(padded[32-len(diffBuff):], diffBuff)
-	buff := padded[0:32]
-	var target uint32
-	targetBuff := bytes.NewReader(buff)
-	binary.Read(targetBuff, binary.LittleEndian, &target)
-	targetHex := hex.EncodeToString(buff)
+// diffToTarget converts a whole number difficulty into a target.
+func diffToTarget(diff float64) *big.Int {
+	divisor := new(big.Int).SetInt64(int64(diff))
+	max := chainParams.PowLimit
+	target := new(big.Int)
+	target.Div(max, divisor)
 
-	return target, targetHex
+	return target
 }
 
 func reverse(src []byte) []byte {

From 2acc93db6199cb9083a08109b653b1fe27147cb4 Mon Sep 17 00:00:00 2001
From: cjepson <cjepson@decred.org>
Date: Fri, 22 Jul 2016 12:43:26 -0400
Subject: [PATCH 014/150] Fix stratum mining and intensity

Stratum mining is now operational. Intensity was a float64 and is now a
uint32. Alternatively, the user may set the work size directly; the work
size must be a multiple of 32.
---
 blake256.cl |  15 ++-
 config.go   |  80 ++++++++++++--
 device.go   | 297 ++++++++++++++++++++++++++++++++++++----------------
 getwork.go  |  30 +++---
 miner.go    | 102 +++++++++++++++---
 stratum.go  | 236 +++++++++++++++++++----------------------
 6 files changed, 499 insertions(+), 261 deletions(-)

diff --git a/blake256.cl b/blake256.cl
index 0b3c7d8..148b1e0 100644
--- a/blake256.cl
+++ b/blake256.cl
@@ -54,8 +54,8 @@ __kernel void search(
 	const uint MB,
 	const uint MC
 )
-{
-	/* Load the block header and padding */
+{	
+	// Load the block header and padding.
 	const uint M3 = get_global_id(0);
 	const uint MD = 0x80000001UL;
 	const uint ME = 0x00000000UL;
@@ -82,7 +82,7 @@ __kernel void search(
 	uint V8, V9, VA, VB, VC, VD, VE, VF;
 	uint pre7;
 
-	/* Load the midstate and initialize */
+	// Load the midstate and initialize.
 	V0 = h0;
 	V1 = h1;
 	V2 = h2;
@@ -101,8 +101,7 @@ __kernel void search(
 	VE = cst6;
 	VF = cst7;
 
-	/* 14 rounds */
-
+	// 14 rounds. 
 	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
 	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
 	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
@@ -154,9 +153,9 @@ __kernel void search(
 	}
 	*/
 
+        // Push this share.
 	if (pre7 ^ V7 ^ VF) return;
-
-	/* Push this share */
-	//output[output[0xFF]++] = M3;
+        
+	// Update nonce.
 	output[++output[0]] = M3;
 }
diff --git a/config.go b/config.go
index 95b56a4..d3c1685 100644
--- a/config.go
+++ b/config.go
@@ -8,7 +8,9 @@ import (
 	"net"
 	"os"
 	"path/filepath"
+	"reflect"
 	"sort"
+	"strconv"
 	"strings"
 
 	"github.com/btcsuite/go-flags"
@@ -30,10 +32,13 @@ var (
 	defaultRPCServer   = "localhost"
 	defaultRPCCertFile = filepath.Join(dcrdHomeDir, "rpc.cert")
 	defaultLogDir      = filepath.Join(minerHomeDir, defaultLogDirname)
-	defaultIntensity   = 26
+	defaultIntensity   = []string{}
+	defaultWorkSize    = []string{}
+
 	// Took these values from cgminer.
 	minIntensity = 8
 	maxIntensity = 31
+	maxWorkSize  = 0xFFFFFFFF
 )
 
 type config struct {
@@ -66,10 +71,13 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
-	Intensity int `short:"i" long:"intensity" description:"Intensity."`
+	Intensity     []string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device, use multiple flags for multiple devices"`
+	IntensityInts []int
+	WorkSize      []string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity), use multiple flags for multiple devices"`
+	WorkSizeInts  []int
 
 	// Pool related options
-	Pool         string `short:"o" long:"pool" description:"Pool to connect to (e.g.stratum+tcp://pool:port) "`
+	Pool         string `short:"o" long:"pool" description:"Pool to connect to (e.g.stratum+tcp://pool:port)"`
 	PoolUser     string `short:"m" long:"pooluser" description:"Pool username"`
 	PoolPassword string `short:"n" long:"poolpass" default-mask:"-" description:"Pool password"`
 }
@@ -224,6 +232,7 @@ func loadConfig() (*config, []string, error) {
 		RPCCert:    defaultRPCCertFile,
 		Intensity:  defaultIntensity,
 		ClKernel:   defaultClKernel,
+		WorkSize:   defaultWorkSize,
 	}
 
 	// Create the home directory if it doesn't already exist.
@@ -293,13 +302,72 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
-	if (cfg.Intensity < minIntensity) || (cfg.Intensity > maxIntensity) {
-		err := fmt.Errorf("Intensity %v not without range %v to %v.",
-			cfg.Intensity, minIntensity, maxIntensity)
+	// The intensity or worksize must be set by the user.
+	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
+		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+		err := fmt.Errorf("Intensity or work size must be set")
 		fmt.Fprintln(os.Stderr, err)
 		return nil, nil, err
 	}
 
+	// Check the intensities if the user is setting that.
+	cfg.IntensityInts = make([]int, len(cfg.Intensity))
+	if !reflect.DeepEqual(cfg.Intensity, defaultIntensity) {
+		for i := range cfg.Intensity {
+			var err error
+			cfg.IntensityInts[i], err = strconv.Atoi(cfg.Intensity[i])
+			if err != nil {
+				err := fmt.Errorf("Could not convert intensity number %v "+
+					"(%v) to int: %s", i, cfg.Intensity[i], err.Error())
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+
+			if (cfg.IntensityInts[i] < minIntensity) ||
+				(cfg.IntensityInts[i] > maxIntensity) {
+				err := fmt.Errorf("Intensity %v (device %v) not within "+
+					"range %v to %v.", cfg.IntensityInts[i], i, minIntensity,
+					maxIntensity)
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+		}
+	}
+
+	// Check the work size.
+	cfg.WorkSizeInts = make([]int, len(cfg.WorkSize))
+	if !reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+		for i := range cfg.WorkSize {
+			var err error
+			cfg.WorkSizeInts[i], err = strconv.Atoi(cfg.WorkSize[i])
+			if err != nil {
+				err := fmt.Errorf("Could not convert work size number %v "+
+					"(%v) to int: %s", i, cfg.WorkSize[i], err.Error())
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+
+			if cfg.WorkSizeInts[i] <= 0 { // zero is rejected too, as the message states
+				err := fmt.Errorf("Zero or negative WorkSize passed: %v",
+					cfg.WorkSizeInts[i])
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+			if cfg.WorkSizeInts[i] > maxWorkSize {
+				err := fmt.Errorf("Too big WorkSize passed: %v, max %v",
+					cfg.WorkSizeInts[i], maxWorkSize)
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+			if cfg.WorkSizeInts[i]%256 != 0 {
+				err := fmt.Errorf("Work size %v not a multiple of 256",
+					cfg.WorkSizeInts[i])
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+		}
+	}
+
 	// Special show command to list supported subsystems and exit.
 	if cfg.DebugLevel == "show" {
 		fmt.Println("Supported subsystems", supportedSubsystems())
diff --git a/device.go b/device.go
index 2b66554..08f6850 100644
--- a/device.go
+++ b/device.go
@@ -1,12 +1,16 @@
 package main
 
 import (
+	"bytes"
 	"encoding/binary"
 	"encoding/hex"
 	"fmt"
+	"io"
 	"math"
 	"math/big"
 	"os"
+	"reflect"
+	"time"
 	"unsafe"
 
 	"github.com/decred/dcrd/blockchain"
@@ -21,9 +25,10 @@ const (
 	localWorksize    = 64
 	uint32Size       = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
 
-	nonce0Word = 3
-	nonce1Word = 4
-	nonce2Word = 5
+	timestampWord = 2
+	nonce0Word    = 3
+	nonce1Word    = 4
+	nonce2Word    = 5
 )
 
 var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
@@ -32,31 +37,48 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	var program_buffer [1][]byte
 	var program_size [1]cl.CL_size_t
 
-	/* Read each program file and place content into buffer array */
+	// Read each program file and place content into buffer array.
 	program_handle, err := os.Open(filename)
 	if err != nil {
 		return nil, nil, err
 	}
 	defer program_handle.Close()
 
-	fi, err := program_handle.Stat()
+	buf := bytes.NewBuffer(nil)
+	_, err = io.Copy(buf, program_handle)
 	if err != nil {
 		return nil, nil, err
 	}
-	program_size[0] = cl.CL_size_t(fi.Size())
+	str := string(buf.Bytes())
+	program_final := []byte(str)
+
+	program_size[0] = cl.CL_size_t(len(program_final))
 	program_buffer[0] = make([]byte, program_size[0])
-	read_size, err := program_handle.Read(program_buffer[0])
-	if err != nil || cl.CL_size_t(read_size) != program_size[0] {
-		return nil, nil, err
+	for i := range program_final {
+		program_buffer[0][i] = program_final[i]
 	}
 
 	return program_buffer[:], program_size[:], nil
 }
 
+// NewWork is the constructor for work.
+func NewWork(data [192]byte, target *big.Int, jobTime uint32, timeReceived uint32,
+	isSolo bool) *Work {
+	return &Work{
+		Data:         data,
+		Target:       target,
+		JobTime:      jobTime,
+		TimeReceived: timeReceived,
+		isSolo:       isSolo,
+	}
+}
+
 type Work struct {
-	Data   [192]byte
-	Target *big.Int
-	Nonce2 uint32
+	Data         [192]byte
+	Target       *big.Int
+	JobTime      uint32
+	TimeReceived uint32
+	isSolo       bool
 }
 
 type Device struct {
@@ -70,6 +92,14 @@ type Device struct {
 	program      cl.CL_program
 	kernel       cl.CL_kernel
 
+	// extraNonce is the device extraNonce, where the first
+	// byte is the device ID (supporting up to 255 devices)
+	// while the last 3 bytes is the extraNonce value. If
+	// the extraNonce goes through all 0x??FFFFFF values,
+	// it will reset to 0x??000000.
+	extraNonce    uint32
+	currentWorkID uint32
+
 	midstate  [8]uint32
 	lastBlock [16]uint32
 
@@ -78,32 +108,41 @@ type Device struct {
 	workDone chan []byte
 	hasWork  bool
 
-	workDoneEMA   float64
-	workDoneLast  float64
-	workDoneTotal float64
-	runningTime   float64
+	started          uint32
+	allDiffOneShares uint64
+	validShares      uint64
+	invalidShares    uint64
 
 	quit chan struct{}
 }
 
-// Compares a and b as big endian
-func hashSmaller(a, b []byte) bool {
-	for i := len(a) - 1; i >= 0; i-- {
-		if a[i] < b[i] {
-			return true
-		}
-		if a[i] > b[i] {
-			return false
-		}
+// Uint32EndiannessSwap swaps the endianness of a uint32.
+func Uint32EndiannessSwap(v uint32) uint32 {
+	return (v&0x000000FF)<<24 | (v&0x0000FF00)<<8 |
+		(v&0x00FF0000)>>8 | (v&0xFF000000)>>24
+}
+
+// rolloverExtraNonce rolls over the extraNonce if it goes over 0x00FFFFFF many
+// hashes, since the first byte is reserved for the ID.
+func rolloverExtraNonce(v *uint32) {
+	if *v&0x00FFFFFF == 0x00FFFFFF {
+		*v = *v & 0xFF000000
+	} else {
+		*v++
 	}
-	return false
 }
 
 func clError(status cl.CL_int, f string) error {
-	return fmt.Errorf("%s returned error %s (%d)", f, cl.ERROR_CODES_STRINGS[-status], status)
+	if -status < 0 || int(-status) >= len(cl.ERROR_CODES_STRINGS) {
+		return fmt.Errorf("%s returned unknown error!", f)
+	}
+
+	return fmt.Errorf("%s returned error %s (%d)", f,
+		cl.ERROR_CODES_STRINGS[-status], status)
 }
 
-func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id, workDone chan []byte) (*Device, error) {
+func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
+	workDone chan []byte) (*Device, error) {
 	d := &Device{
 		index:      index,
 		platformID: platformID,
@@ -116,65 +155,75 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 
 	var status cl.CL_int
 
-	// Create the CL context
-	d.context = cl.CLCreateContext(nil, 1, []cl.CL_device_id{deviceID}, nil, nil, &status)
+	// Create the CL context.
+	d.context = cl.CLCreateContext(nil, 1, []cl.CL_device_id{deviceID},
+		nil, nil, &status)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLCreateContext")
 	}
 
-	// Create the command queue
+	// Create the command queue.
 	d.queue = cl.CLCreateCommandQueue(d.context, deviceID, 0, &status)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLCreateCommandQueue")
 	}
 
-	// Create the output buffer
-	d.outputBuffer = cl.CLCreateBuffer(d.context, cl.CL_MEM_READ_WRITE, uint32Size*outputBufferSize, nil, &status)
+	// Create the output buffer.
+	d.outputBuffer = cl.CLCreateBuffer(d.context, cl.CL_MEM_READ_WRITE,
+		uint32Size*outputBufferSize, nil, &status)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLCreateBuffer")
 	}
 
-	// Load kernel source
+	// Load kernel source.
 	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
 	if err != nil {
 		return nil, fmt.Errorf("Could not load kernel source: %v", err)
 	}
 
-	// Create the program
-	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc[:], progSize[:], &status)
+	// Create the program.
+	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc[:],
+		progSize[:], &status)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLCreateProgramWithSource")
 	}
 
-	// Build the program for the device
+	// Build the program for the device.
 	compilerOptions := ""
 	compilerOptions += fmt.Sprintf(" -D WORKSIZE=%d", localWorksize)
-	status = cl.CLBuildProgram(d.program, 1, []cl.CL_device_id{deviceID}, []byte(compilerOptions), nil, nil)
+	status = cl.CLBuildProgram(d.program, 1, []cl.CL_device_id{deviceID},
+		[]byte(compilerOptions), nil, nil)
 	if status != cl.CL_SUCCESS {
 		err = clError(status, "CLBuildProgram")
 
 		// Something went wrong! Print what it is.
 		var logSize cl.CL_size_t
-		status = cl.CLGetProgramBuildInfo(d.program, deviceID, cl.CL_PROGRAM_BUILD_LOG, 0, nil, &logSize)
+		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
+			cl.CL_PROGRAM_BUILD_LOG, 0, nil, &logSize)
 		if status != cl.CL_SUCCESS {
-			minrLog.Errorf("Could not obtain compilation error log: %v", clError(status, "CLGetProgramBuildInfo"))
+			minrLog.Errorf("Could not obtain compilation error log: %v",
+				clError(status, "CLGetProgramBuildInfo"))
 		}
 		var program_log interface{}
-		status = cl.CLGetProgramBuildInfo(d.program, deviceID, cl.CL_PROGRAM_BUILD_LOG, logSize, &program_log, nil)
+		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
+			cl.CL_PROGRAM_BUILD_LOG, logSize, &program_log, nil)
 		if status != cl.CL_SUCCESS {
-			minrLog.Errorf("Could not obtain compilation error log: %v", clError(status, "CLGetProgramBuildInfo"))
+			minrLog.Errorf("Could not obtain compilation error log: %v",
+				clError(status, "CLGetProgramBuildInfo"))
 		}
 		minrLog.Errorf("%s\n", program_log)
 
 		return nil, err
 	}
 
-	// Create the kernel
+	// Create the kernel.
 	d.kernel = cl.CLCreateKernel(d.program, []byte("search"), &status)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLCreateKernel")
 	}
 
+	d.started = uint32(time.Now().Unix())
+
 	return d, nil
 }
 
@@ -210,27 +259,33 @@ func (d *Device) updateCurrentWork() {
 
 	d.work = *w
 	minrLog.Tracef("pre-nonce: %v", hex.EncodeToString(d.work.Data[:]))
-	// Set nonce2
-	binary.LittleEndian.PutUint32(d.work.Data[124+4*nonce2Word:], d.work.Nonce2)
+
+	// Bump and set the work ID if the work is new.
+	d.currentWorkID++
+	binary.LittleEndian.PutUint32(d.work.Data[128+4*nonce2Word:],
+		d.currentWorkID)
+
 	// Reset the hash state
 	copy(d.midstate[:], blake256.IV256[:])
 
 	// Hash the two first blocks
 	blake256.Block(d.midstate[:], d.work.Data[0:64], 512)
 	blake256.Block(d.midstate[:], d.work.Data[64:128], 1024)
-	minrLog.Tracef("midstate input data %v", hex.EncodeToString(d.work.Data[0:128]))
+	minrLog.Tracef("midstate input data for work update %v",
+		hex.EncodeToString(d.work.Data[0:128]))
 
 	// Convert the next block to uint32 array.
 	for i := 0; i < 16; i++ {
 		d.lastBlock[i] = binary.BigEndian.Uint32(d.work.Data[128+i*4 : 132+i*4])
-		//minrLog.Tracef("lastblockin %v: %v", i, d.lastBlock[i])
 	}
-	minrLog.Tracef("data: %v", hex.EncodeToString(d.work.Data[:]))
+	minrLog.Tracef("work data for work update: %v",
+		hex.EncodeToString(d.work.Data[:]))
 }
 
 func (d *Device) Run() {
 	//d.testFoundCandidate()
 	//return
+
 	err := d.runDevice()
 	if err != nil {
 		minrLog.Errorf("Error on device: %v", err)
@@ -263,7 +318,8 @@ func (d *Device) testFoundCandidate() {
 	minrLog.Errorf("target: %v", d.work.Target)
 	minrLog.Errorf("nonce1 %x, nonce0: %x", n1, n0)
 
-	d.foundCandidate(n1, n0)
+	// d.foundCandidate(n1, n0, ts)
+
 	//need to match
 	//00000000df6ffb6059643a9215f95751baa7b1ed8aa93edfeb9a560ecb1d5884
 	//stratum submit {"params": ["test", "76df", "0200000000a461f2e3014335", "5783c78e", "e38c6e00"], "id": 4, "method": "mining.submit"}
@@ -272,8 +328,25 @@ func (d *Device) testFoundCandidate() {
 func (d *Device) runDevice() error {
 	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
-	globalWorksize := math.Exp2(float64(cfg.Intensity))
-	minrLog.Debugf("Intensity %v", cfg.Intensity)
+	var globalWorksize uint32
+	if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+		globalWorksize = 1 << uint32(cfg.IntensityInts[d.index])
+		minrLog.Debugf("GPU #%d: Intensity %v (work size: %v)", d.index,
+			cfg.IntensityInts[d.index], globalWorksize)
+	} else {
+		globalWorksize = uint32(cfg.WorkSizeInts[d.index])
+		intensity := math.Log2(float64(cfg.WorkSizeInts[d.index]))
+		minrLog.Debugf("GPU #%d: Work size: %v ('intensity' %v)", d.index,
+			cfg.WorkSizeInts[d.index], intensity)
+	}
+
+	// Bump the extraNonce for the device it's running on
+	// when you begin mining. This ensures each GPU is doing
+	// different work. If the extraNonce has already been
+	// set for valid work, restore that.
+	d.extraNonce += uint32(d.index) << 24
+	d.lastBlock[nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
+
 	var status cl.CL_int
 	for {
 		d.updateCurrentWork()
@@ -284,25 +357,33 @@ func (d *Device) runDevice() error {
 		default:
 		}
 
-		// Increment nonce1
-		//d.lastBlock[nonce1Word]++
-		d.work.Nonce2++
-		var tmpBytes = make([]byte, 4)
-		binary.LittleEndian.PutUint32(tmpBytes, d.work.Nonce2)
-		d.lastBlock[nonce1Word] = binary.BigEndian.Uint32(tmpBytes)
+		// Increment extraNonce.
+		rolloverExtraNonce(&d.extraNonce)
+		d.lastBlock[nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
+
+		// Update the timestamp. Only solo work allows you to roll
+		// the timestamp.
+		ts := d.work.JobTime
+		if d.work.isSolo {
+			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+			ts = d.work.JobTime + diffSeconds
+		}
+		d.lastBlock[timestampWord] = Uint32EndiannessSwap(ts)
 
 		// arg 0: pointer to the buffer
 		obuf := d.outputBuffer
-		status = cl.CLSetKernelArg(d.kernel, 0, cl.CL_size_t(unsafe.Sizeof(obuf)), unsafe.Pointer(&obuf))
+		status = cl.CLSetKernelArg(d.kernel, 0,
+			cl.CL_size_t(unsafe.Sizeof(obuf)),
+			unsafe.Pointer(&obuf))
 		if status != cl.CL_SUCCESS {
 			return clError(status, "CLSetKernelArg")
 		}
 
 		// args 1..8: midstate
 		for i := 0; i < 8; i++ {
-			//minrLog.Tracef("mid: %v: %v", i+1, d.midstate[i])
 			ms := d.midstate[i]
-			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1), uint32Size, unsafe.Pointer(&ms))
+			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1),
+				uint32Size, unsafe.Pointer(&ms))
 			if status != cl.CL_SUCCESS {
 				return clError(status, "CLSetKernelArg")
 			}
@@ -315,8 +396,8 @@ func (d *Device) runDevice() error {
 				i2++
 			}
 			lb := d.lastBlock[i2]
-			//minrLog.Tracef("lastblockused: %v: %v", i+9, lb)
-			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9), uint32Size, unsafe.Pointer(&lb))
+			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9),
+				uint32Size, unsafe.Pointer(&lb))
 			if status != cl.CL_SUCCESS {
 				return clError(status, "CLSetKernelArg")
 			}
@@ -324,57 +405,82 @@ func (d *Device) runDevice() error {
 		}
 
 		// Clear the found count from the buffer
-		status = cl.CLEnqueueWriteBuffer(d.queue, d.outputBuffer, cl.CL_FALSE, 0, uint32Size, unsafe.Pointer(&zeroSlice[0]), 0, nil, nil)
+		status = cl.CLEnqueueWriteBuffer(d.queue, d.outputBuffer,
+			cl.CL_FALSE, 0, uint32Size, unsafe.Pointer(&zeroSlice[0]),
+			0, nil, nil)
 		if status != cl.CL_SUCCESS {
 			return clError(status, "CLEnqueueWriteBuffer")
 		}
 
-		// Execute the kernel
+		// Execute the kernel and follow its execution time.
+		currentTime := time.Now()
 		var globalWorkSize [1]cl.CL_size_t
 		globalWorkSize[0] = cl.CL_size_t(globalWorksize)
 		var localWorkSize [1]cl.CL_size_t
 		localWorkSize[0] = localWorksize
-		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil, globalWorkSize[:], localWorkSize[:], 0, nil, nil)
+		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil,
+			globalWorkSize[:], localWorkSize[:], 0, nil, nil)
 		if status != cl.CL_SUCCESS {
 			return clError(status, "CLEnqueueNDRangeKernel")
 		}
 
-		// Read the output buffer
-		cl.CLEnqueueReadBuffer(d.queue, d.outputBuffer, cl.CL_TRUE, 0, uint32Size*outputBufferSize, unsafe.Pointer(&outputData[0]), 0, nil, nil)
+		// Read the output buffer, keeping its status for the check below.
+		status = cl.CLEnqueueReadBuffer(d.queue, d.outputBuffer, cl.CL_TRUE,
+			0, uint32Size*outputBufferSize, unsafe.Pointer(&outputData[0]),
+			0, nil, nil)
 		if status != cl.CL_SUCCESS {
 			return clError(status, "CLEnqueueReadBuffer")
 		}
 
 		for i := uint32(0); i < outputData[0]; i++ {
-			minrLog.Debugf("Found candidate: %d", outputData[i+1])
-			d.foundCandidate(d.lastBlock[nonce1Word], outputData[i+1])
+			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
+				"extraNonce %08x, workID %08x, timestamp %08x",
+				d.index, i+1, outputData[i+1], d.lastBlock[nonce1Word],
+				Uint32EndiannessSwap(d.currentWorkID),
+				d.lastBlock[timestampWord])
+
+			// Assess the work. If it's below target, it'll be rejected
+			// here. The mining algorithm currently sends this function any
+			// difficulty 1 shares.
+			d.foundCandidate(d.lastBlock[timestampWord], outputData[i+1],
+				d.lastBlock[nonce1Word])
 		}
 
-		d.workDoneLast += globalWorksize
-		d.workDoneTotal += globalWorksize
+		elapsedTime := time.Since(currentTime)
+		minrLog.Tracef("GPU #%d: Kernel execution to read time: %v", d.index,
+			elapsedTime)
 	}
 }
 
-func (d *Device) foundCandidate(nonce1 uint32, nonce0 uint32) {
-	// Construct the final block header
+func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
+	// Construct the final block header.
 	data := make([]byte, 192)
 	copy(data, d.work.Data[:])
-	binary.BigEndian.PutUint32(data[128+4*nonce1Word:], nonce1)
+	binary.BigEndian.PutUint32(data[128+4*timestampWord:], ts)
 	binary.BigEndian.PutUint32(data[128+4*nonce0Word:], nonce0)
-	hash := chainhash.HashFuncB(data[0:180])
-
-	newHash, err := chainhash.NewHashFromStr(hex.EncodeToString(reverse(hash[:])))
-	if err != nil {
-		minrLog.Error(err)
+	binary.BigEndian.PutUint32(data[128+4*nonce1Word:], nonce1)
+	hash := chainhash.HashFuncH(data[0:180])
+
+	// Hashes that reach this logic and fail the minimal proof of
+	// work check are considered to be hardware errors.
+	hashNum := blockchain.ShaHashToBig(&hash)
+	if hashNum.Cmp(chainParams.PowLimit) > 0 {
+		minrLog.Errorf("GPU #%d: Hardware error found, hash %v above "+
+			"minimum target %032x", d.index, hash,
+			chainParams.PowLimit.Bytes())
+		d.invalidShares++
+		return
+	}
+	d.allDiffOneShares++
-	minrLog.Errorf("hash: %x", hash)
-	minrLog.Errorf("newHash: %v", newHash)
-	hashNum := blockchain.ShaHashToBig(newHash)
-	if hashNum.Cmp(d.work.Target) > 0 {
-		minrLog.Infof("Hash %s below target %s", hex.EncodeToString(reverse(hash[:])), d.work.Target)
 
+	// Assess versus the pool or daemon target.
+	if hashNum.Cmp(d.work.Target) > 0 {
+		minrLog.Debugf("GPU #%d: Hash %v bigger than target %032x (boo)",
+			d.index, hash, d.work.Target.Bytes())
 	} else {
-		minrLog.Infof("Found hash!!  %s", hex.EncodeToString(hash[:]))
+		minrLog.Infof("GPU #%d: Found hash with work below target! %v (yay)",
+			d.index, hash)
+		d.validShares++
 		d.workDone <- data
 	}
 }
@@ -426,11 +532,20 @@ func getDeviceInfo(id cl.CL_device_id,
 }
 
 func (d *Device) PrintStats() {
-	alpha := 0.95
-	d.workDoneEMA = d.workDoneEMA*alpha + d.workDoneLast*(1-alpha)
-	d.workDoneLast = 0
-	d.runningTime += 5.0
+	secondsElapsed := uint32(time.Now().Unix()) - d.started
+	if secondsElapsed == 0 {
+		return
+	}
 
-	minrLog.Infof("GPU #%d: %s, EMA %s avg %s", d.index, d.deviceName,
-		formatHashrate(d.workDoneEMA), formatHashrate(d.workDoneTotal/d.runningTime))
+	diffOneShareHashesAvg := uint64(0x00000000FFFFFFFF)
+	averageHashRate := (float64(diffOneShareHashesAvg) *
+		float64(d.allDiffOneShares)) /
+		float64(secondsElapsed)
+
+	minrLog.Infof("GPU #%d (%s) reporting average hash rate %v, %v/%v valid work",
+		d.index,
+		d.deviceName,
+		formatHashrate(averageHashRate),
+		d.validShares,
+		d.validShares+d.invalidShares)
 }
diff --git a/getwork.go b/getwork.go
index 153168d..fcb2231 100644
--- a/getwork.go
+++ b/getwork.go
@@ -158,7 +158,8 @@ func GetWork() (*Work, error) {
 	}
 
 	if res.Error != nil {
-		return nil, fmt.Errorf("JSONRPC Error %d: %s", res.Error.Code, res.Error.Message)
+		return nil, fmt.Errorf("JSONRPC Error %d: %s", res.Error.Code,
+			res.Error.Message)
 	}
 
 	data, err := hex.DecodeString(res.Result.Data)
@@ -166,14 +167,16 @@ func GetWork() (*Work, error) {
 		return nil, err
 	}
 	if len(data) != 192 {
-		return nil, fmt.Errorf("Wrong data length: got %d, expected 192", len(data))
+		return nil, fmt.Errorf("Wrong data length: got %d, expected 192",
+			len(data))
 	}
 	target, err := hex.DecodeString(res.Result.Target)
 	if err != nil {
 		return nil, err
 	}
 	if len(target) != 32 {
-		return nil, fmt.Errorf("Wrong target length: got %d, expected 32", len(target))
+		return nil, fmt.Errorf("Wrong target length: got %d, expected 32",
+			len(target))
 	}
 	bigTarget := new(big.Int)
 	bigTarget.SetString(hex.EncodeToString(target), 16)
@@ -202,7 +205,7 @@ func GetPoolWork(pool *Stratum) (*Work, error) {
 		}
 
 		intJob, _ := strconv.ParseInt(pool.PoolWork.JobID, 16, 0)
-		poolLog.Infof("job %v height %v", intJob, pool.PoolWork.Height)
+		poolLog.Debugf("new job %v height %v", intJob, pool.PoolWork.Height)
 
 		return pool.PoolWork.Work, nil
 	}
@@ -224,7 +227,8 @@ func GetWorkSubmit(data []byte) (bool, error) {
 	}
 	url := protocol + "://" + cfg.RPCServer
 	hexData := hex.EncodeToString(data)
-	jsonStr := []byte(`{"jsonrpc": "2.0", "method": "getwork", "params": ["` + hexData + `"], "id": 1}`)
+	jsonStr := []byte(`{"jsonrpc": "2.0", "method": "getwork", "params": ["` +
+		hexData + `"], "id": 1}`)
 	bodyBuff := bytes.NewBuffer(jsonStr)
 	httpRequest, err := http.NewRequest("POST", url, bodyBuff)
 	if err != nil {
@@ -255,7 +259,8 @@ func GetWorkSubmit(data []byte) (bool, error) {
 	}
 
 	if httpResponse.Status != "200 OK" {
-		return false, fmt.Errorf("error calling getwork (%s): %s", httpResponse.Status, body)
+		return false, fmt.Errorf("error calling getwork (%s): %s",
+			httpResponse.Status, body)
 	}
 
 	var res getWorkSubmitResponseJson
@@ -265,7 +270,8 @@ func GetWorkSubmit(data []byte) (bool, error) {
 	}
 
 	if res.Error != nil {
-		return false, fmt.Errorf("JSONRPC Error %d: %s", res.Error.Code, res.Error.Message)
+		return false, fmt.Errorf("JSONRPC Error %d: %s", res.Error.Code,
+			res.Error.Message)
 	}
 
 	return res.Result, nil
@@ -273,19 +279,18 @@ func GetWorkSubmit(data []byte) (bool, error) {
 
 // GetPoolWorkSubmit sends the result to the stratum enabled pool
 func GetPoolWorkSubmit(data []byte, pool *Stratum) (bool, error) {
-
 	sub, err := pool.PrepSubmit(data)
 	if err != nil {
 		return false, err
 	}
 
-	// json encode
+	// JSON encode.
 	m, err := json.Marshal(sub)
 	if err != nil {
 		return false, err
 	}
 
-	// send
+	// Send.
 	poolLog.Tracef("> %s", m)
 	_, err = pool.Conn.Write(m)
 	if err != nil {
@@ -295,9 +300,8 @@ func GetPoolWorkSubmit(data []byte, pool *Stratum) (bool, error) {
 	if err != nil {
 		return false, err
 	}
-	pool.submitted = true
 
-	pool.PoolWork.Work = nil
+	pool.submitted = true
 
-	return false, nil
+	return true, nil
 }
diff --git a/miner.go b/miner.go
index 479ca27..29ae241 100644
--- a/miner.go
+++ b/miner.go
@@ -2,7 +2,9 @@ package main
 
 import (
 	"fmt"
+	"reflect"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/decred/gominer/cl"
@@ -25,12 +27,14 @@ func getCLPlatforms() ([]cl.CL_platform_id, error) {
 // getCLDevices returns the list of devices for the given platform.
 func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
 	var numDevices cl.CL_uint
-	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices)
+	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_GPU, 0, nil,
+		&numDevices)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLGetDeviceIDs")
 	}
 	devices := make([]cl.CL_device_id, numDevices)
-	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices, devices, nil)
+	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_GPU, numDevices,
+		devices, nil)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLGetDeviceIDs")
 	}
@@ -44,6 +48,11 @@ type Miner struct {
 	needsWorkRefresh chan struct{}
 	wg               sync.WaitGroup
 	pool             *Stratum
+
+	started       uint32
+	validShares   uint64
+	staleShares   uint64
+	invalidShares uint64
 }
 
 func NewMiner() (*Miner, error) {
@@ -72,6 +81,21 @@ func NewMiner() (*Miner, error) {
 		return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
 	}
 
+	// Check the number of intensities/work sizes versus the number of devices.
+	if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+		if len(cfg.Intensity) != len(deviceIDs) {
+			return nil, fmt.Errorf("Intensities supplied, but number supplied "+
+				"did not match the number of GPUs (got %v, want %v)",
+				len(cfg.Intensity), len(deviceIDs))
+		}
+	} else {
+		if len(cfg.WorkSize) != len(deviceIDs) {
+			return nil, fmt.Errorf("WorkSize supplied, but number supplied "+
+				"did not match the number of GPUs (got %v, want %v)",
+				len(cfg.WorkSize), len(deviceIDs))
+		}
+	}
+
 	m.devices = make([]*Device, len(deviceIDs))
 	for i, deviceID := range deviceIDs {
 		var err error
@@ -81,6 +105,8 @@ func NewMiner() (*Miner, error) {
 		}
 	}
 
+	m.started = uint32(time.Now().Unix())
+
 	return m, nil
 }
 
@@ -96,17 +122,56 @@ func (m *Miner) workSubmitThread() {
 			if m.pool == nil {
 				accepted, err := GetWorkSubmit(data)
 				if err != nil {
+					inval := atomic.LoadUint64(&m.invalidShares)
+					inval++
+					atomic.StoreUint64(&m.invalidShares, inval)
+
 					minrLog.Errorf("Error submitting work: %v", err)
 				} else {
-					minrLog.Errorf("Submitted work successfully: %v", accepted)
+					if accepted {
+						val := atomic.LoadUint64(&m.validShares)
+						val++
+						atomic.StoreUint64(&m.validShares, val)
+
+						minrLog.Debugf("Submitted work successfully: %v",
+							accepted)
+					} else {
+						inval := atomic.LoadUint64(&m.invalidShares)
+						inval++
+						atomic.StoreUint64(&m.invalidShares, inval)
+					}
+
 					m.needsWorkRefresh <- struct{}{}
 				}
 			} else {
 				accepted, err := GetPoolWorkSubmit(data, m.pool)
 				if err != nil {
-					minrLog.Errorf("Error submitting work to pool: %v", err)
+					if err == ErrStatumStaleWork {
+						stale := atomic.LoadUint64(&m.staleShares)
+						stale++
+						atomic.StoreUint64(&m.staleShares, stale)
+					} else {
+						inval := atomic.LoadUint64(&m.invalidShares)
+						inval++
+						atomic.StoreUint64(&m.invalidShares, inval)
+
+						minrLog.Errorf("Error submitting work to pool: %v", err)
+					}
 				} else {
-					minrLog.Errorf("Submitted work to pool successfully: %v", accepted)
+					if accepted {
+						val := atomic.LoadUint64(&m.validShares)
+						val++
+						atomic.StoreUint64(&m.validShares, val)
+
+						minrLog.Debugf("Submitted work to pool successfully: %v",
+							accepted)
+					} else {
+						// Share was not accepted. Count it
+						// exactly once, using an atomic add
+						// to match the atomic loads done by
+						// printStatsThread.
+						atomic.AddUint64(&m.invalidShares, 1)
+					}
 					m.needsWorkRefresh <- struct{}{}
 				}
 			}
@@ -117,7 +182,7 @@ func (m *Miner) workSubmitThread() {
 func (m *Miner) workRefreshThread() {
 	defer m.wg.Done()
 
-	t := time.NewTicker(time.Second)
+	t := time.NewTicker(100 * time.Millisecond)
 	defer t.Stop()
 
 	for {
@@ -132,12 +197,14 @@ func (m *Miner) workRefreshThread() {
 				}
 			}
 		} else {
-			work, err := GetPoolWork(m.pool)
-			if err != nil {
-				minrLog.Errorf("Error in getpoolwork: %v", err)
-			} else {
-				for _, d := range m.devices {
-					d.SetWork(work)
+			if m.pool.PoolWork.NewWork {
+				work, err := GetPoolWork(m.pool)
+				if err != nil {
+					minrLog.Errorf("Error in getpoolwork: %v", err)
+				} else {
+					for _, d := range m.devices {
+						d.SetWork(work)
+					}
 				}
 			}
 		}
@@ -157,6 +224,17 @@ func (m *Miner) printStatsThread() {
 	defer t.Stop()
 
 	for {
+		valid := atomic.LoadUint64(&m.validShares)
+		minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v",
+			valid,
+			atomic.LoadUint64(&m.invalidShares),
+			atomic.LoadUint64(&m.staleShares))
+
+		secondsElapsed := uint32(time.Now().Unix()) - m.started
+		if (secondsElapsed / 60) > 0 {
+			utility := float64(valid) / (float64(secondsElapsed) / float64(60))
+			minrLog.Infof("Global utility (accepted shares/min): %v", utility)
+		}
 		for _, d := range m.devices {
 			d.PrintStats()
 		}
diff --git a/stratum.go b/stratum.go
index bd0819e..c1f8cfd 100644
--- a/stratum.go
+++ b/stratum.go
@@ -17,32 +17,38 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"
 
 	"github.com/davecgh/go-spew/spew"
 
 	"github.com/decred/dcrd/chaincfg"
+	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/wire"
 )
 
 var chainParams = &chaincfg.MainNetParams
 
+// ErrStatumStaleWork indicates that the work to send to the pool was stale.
+var ErrStatumStaleWork = fmt.Errorf("Stale work, throwing away")
+
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
 type Stratum struct {
-	Pool      string
-	User      string
-	Pass      string
-	Conn      net.Conn
-	Reader    *bufio.Reader
-	ID        uint64
-	authID    uint64
-	subID     uint64
-	submitID  uint64
-	Diff      float64
-	Target    *big.Int
-	submitted bool
-	PoolWork  NotifyWork
+	Pool          string
+	User          string
+	Pass          string
+	Conn          net.Conn
+	Reader        *bufio.Reader
+	ID            uint64
+	authID        uint64
+	subID         uint64
+	submitID      uint64
+	Diff          float64
+	Target        *big.Int
+	submitted     bool
+	PoolWork      NotifyWork
+	latestJobTime uint32
 }
 
 // NotifyWork holds all the info recieved from a mining.notify message along
@@ -148,10 +154,12 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 	stratum.Pool = pool
 	stratum.User = user
 	stratum.Pass = pass
+
 	// We will set it for sure later but this really should be the value and
 	// setting it here will prevent so incorrect matches based on the
 	// default 0 value.
 	stratum.authID = 2
+
 	// Target for share is 1 unless we hear otherwise.
 	stratum.Diff = 1
 	stratum.Target = diffToTarget(stratum.Diff)
@@ -217,12 +225,14 @@ func (s *Stratum) Listen() {
 			}
 			continue
 		}
+
 		poolLog.Debug(strings.TrimSuffix(result, "\n"))
 		resp, err := s.Unmarshal([]byte(result))
 		if err != nil {
 			poolLog.Error(err)
 			continue
 		}
+
 		switch resp.(type) {
 		case *BasicReply:
 			aResp := resp.(*BasicReply)
@@ -241,6 +251,7 @@ func (s *Stratum) Listen() {
 				}
 				s.submitted = false
 			}
+
 		case StratumMsg:
 			nResp := resp.(StratumMsg)
 			poolLog.Trace(nResp)
@@ -267,6 +278,7 @@ func (s *Stratum) Listen() {
 					// the channel to end everything.
 					return
 				}
+
 			case "client.get_version":
 				poolLog.Debug("get_version request received.")
 				msg := StratumMsg{
@@ -290,18 +302,18 @@ func (s *Stratum) Listen() {
 					continue
 				}
 			}
+
 		case NotifyRes:
 			nResp := resp.(NotifyRes)
 			s.PoolWork.JobID = nResp.JobID
 			s.PoolWork.CB1 = nResp.GenTX1
-			//poolLog.Trace("CB1: " + spew.Sdump(s.PoolWork.CB1))
-			//height := nResp.GenTX1[184:188]
 			heightHex := nResp.GenTX1[186:188] + nResp.GenTX1[184:186]
 			height, err := strconv.ParseInt(heightHex, 16, 32)
 			if err != nil {
 				poolLog.Tracef("failed to parse height %v", err)
 				height = 0
 			}
+
 			s.PoolWork.Height = height
 			s.PoolWork.CB2 = nResp.GenTX2
 			s.PoolWork.Hash = nResp.Hash
@@ -311,17 +323,20 @@ func (s *Stratum) Listen() {
 			if err != nil {
 				poolLog.Error(err)
 			}
+
 			s.PoolWork.Ntime = nResp.Ntime
 			s.PoolWork.NtimeDelta = parsedNtime - time.Now().Unix()
 			s.PoolWork.Clean = nResp.CleanJobs
 			s.PoolWork.NewWork = true
 			poolLog.Trace("notify: ", spew.Sdump(nResp))
+
 		case *SubscribeReply:
 			nResp := resp.(*SubscribeReply)
 			s.PoolWork.ExtraNonce1 = nResp.ExtraNonce1
 			s.PoolWork.ExtraNonce2Length = nResp.ExtraNonce2Length
 			poolLog.Info("Subscribe reply received.")
 			poolLog.Trace(spew.Sdump(resp))
+
 		default:
 			poolLog.Info("Unhandled message: ", result)
 		}
@@ -399,7 +414,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		return nil, err
 	}
 	// decode command
-	// Not everyone has a method
+	// Not everyone has a method.
 	err = json.Unmarshal(objmap["method"], &method)
 	if err != nil {
 		method = ""
@@ -497,6 +512,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 			if err != nil {
 				return nil, err
 			}
+
 			for i := 0; i < len(innerMsg); i++ {
 				if innerMsg[i][0] == "mining.notify" {
 					resp.SubscribeID = innerMsg[i][1]
@@ -511,8 +527,8 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 					// we ignore.
 				}
 			}
-
 		}
+
 		resp.ExtraNonce1 = resi[1].(string)
 		resp.ExtraNonce2Length = resi[2].(float64)
 		return resp, nil
@@ -614,6 +630,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		}
 		nres.CleanJobs = cleanJobs
 		return nres, nil
+
 	case "mining.set_difficulty":
 		poolLog.Trace("Received new difficulty.")
 		var resi []interface{}
@@ -636,6 +653,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		nres.Params = params
 		poolLog.Infof("Stratum difficulty set to %v", difficulty)
 		return nres, nil
+
 	case "client.show_message":
 		var resi []interface{}
 		err := json.Unmarshal(objmap["result"], &resi)
@@ -652,6 +670,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		params = append(params, msg)
 		nres.Params = params
 		return nres, nil
+
 	case "client.get_version":
 		var nres = StratumMsg{}
 		var id uint64
@@ -662,6 +681,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		nres.Method = method
 		nres.ID = id
 		return nres, nil
+
 	case "client.reconnect":
 		var nres = StratumMsg{}
 		var id uint64
@@ -700,6 +720,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		nres.Params = []string{hostname, port, wait}
 
 		return nres, nil
+
 	default:
 		resp := &StratumRsp{}
 		err := json.Unmarshal(blob, &resp)
@@ -712,15 +733,15 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 
 // PrepWork converts the stratum notify to getwork style data for mining.
 func (s *Stratum) PrepWork() error {
-
-	// Build final extranonce
+	// Build final extranonce, which is basically the pool user and worker
+	// ID.
 	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
 	if err != nil {
 		poolLog.Error("Error decoding ExtraNonce1.")
 		return err
 	}
-	poolLog.Debugf("en1 %v s.PoolWork.ExtraNonce1 %v", en1, s.PoolWork.ExtraNonce1)
-	// Work out padding
+
+	// Work out padding.
 	tmp := []string{"%0", strconv.Itoa(int(s.PoolWork.ExtraNonce2Length) * 2), "x"}
 	fmtString := strings.Join(tmp, "")
 	en2, err := hex.DecodeString(fmt.Sprintf(fmtString, s.PoolWork.ExtraNonce2))
@@ -728,46 +749,23 @@ func (s *Stratum) PrepWork() error {
 		poolLog.Error("Error decoding ExtraNonce2.")
 		return err
 	}
-	poolLog.Tracef("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
 	extraNonce := append(en1[:], en2[:]...)
-	poolLog.Tracef("extraNonce %v", extraNonce)
-
-	// Increase extranonce2
-	s.PoolWork.ExtraNonce2++
-
-	// Put coinbase transaction together
 
+	// Put coinbase transaction together.
 	cb1, err := hex.DecodeString(s.PoolWork.CB1)
 	if err != nil {
 		poolLog.Error("Error decoding Coinbase pt 1.")
 		return err
 	}
-	poolLog.Debugf("cb1 %v s.PoolWork.CB1 %v", cb1, s.PoolWork.CB1)
-
-	// I've never actually seen a cb2.
-	cb2, err := hex.DecodeString(s.PoolWork.CB2)
-	if err != nil {
-		poolLog.Error("Error decoding Coinbase pt 2.")
-		return err
-	}
-	poolLog.Debugf("cb2 %v s.PoolWork.CB2 %v", cb2, s.PoolWork.CB2)
-
-	cb := append(cb1[:], extraNonce[:]...)
-	cb = append(cb[:], cb2[:]...)
-	poolLog.Debugf("cb %v", cb)
 
-	// Calculate merkle root
-	// I have never seen anything sent in the merkle tree
-	// sent by the pool so not much I can do here.
-	// Confirmed in ccminer code.
-	// Same for StakeRoot
+	// cb2 is never actually sent, so don't try to decode it.
 
-	// Generate current ntime
+	// Generate current ntime.
 	ntime := time.Now().Unix() + s.PoolWork.NtimeDelta
 
-	poolLog.Tracef("ntime: %v", ntime)
+	poolLog.Tracef("ntime: %x", ntime)
 
-	// Serialize header
+	// Serialize header.
 	bh := wire.BlockHeader{}
 	v, err := reverseToInt(s.PoolWork.Version)
 	if err != nil {
@@ -786,17 +784,15 @@ func (s *Stratum) PrepWork() error {
 	t := time.Now().Unix() + s.PoolWork.NtimeDelta
 	bh.Timestamp = time.Unix(t, 0)
 	bh.Nonce = 0
-	// Serialized version
+
+	// Serialized version.
 	blockHeader, err := bh.Bytes()
 	if err != nil {
 		return err
 	}
 
 	data := blockHeader
-	poolLog.Debugf("data0 %v", data)
-	poolLog.Tracef("data len %v", len(data))
 	copy(data[31:139], cb1[0:108])
-	poolLog.Debugf("data1 %v", data)
 
 	var workdata [180]byte
 	workPosition := 0
@@ -807,8 +803,6 @@ func (s *Stratum) PrepWork() error {
 		return err
 	}
 	copy(workdata[workPosition:], version.Bytes())
-	poolLog.Debugf("appended version.Bytes() %v", version.Bytes())
-	poolLog.Tracef("partial workdata (version): %v", hex.EncodeToString(workdata[:]))
 
 	prevHash := revHash(s.PoolWork.Hash)
 	p, err := hex.DecodeString(prevHash)
@@ -819,68 +813,50 @@ func (s *Stratum) PrepWork() error {
 
 	workPosition += 4
 	copy(workdata[workPosition:], p)
-	poolLog.Tracef("partial workdata (previous hash): %v", hex.EncodeToString(workdata[:]))
-	poolLog.Debugf("prevHash %v", prevHash)
-
 	workPosition += 32
 	copy(workdata[workPosition:], cb1[0:108])
-	poolLog.Tracef("partial workdata (cb1): %v", hex.EncodeToString(workdata[:]))
-
 	workPosition += 108
 	copy(workdata[workPosition:], extraNonce)
-	poolLog.Debugf("extranonce: %v", hex.EncodeToString(extraNonce))
-	poolLog.Tracef("partial workdata (extranonce): %v", hex.EncodeToString(workdata[:]))
 
 	var randomBytes = make([]byte, 4)
 	_, err = rand.Read(randomBytes)
 	if err != nil {
 		poolLog.Errorf("Unable to generate random bytes")
+		return err
 	}
 	workPosition += 4
-	// XXX would be nice to enable a static 'random' number here for tests
-	//binary.LittleEndian.PutUint32(randomBytes, 4066485248)
-	poolLog.Tracef("Random data: %v at: %v", randomBytes, workPosition)
-	copy(workdata[workPosition:], randomBytes)
-
-	poolLog.Debugf("workdata len %v", len(workdata))
-	poolLog.Tracef("workdata %v", hex.EncodeToString(workdata[:]))
-
-	var w Work
-	/*var empty = []byte{
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00,
-	}
-	copy(w.Data[:], empty[:])*/
-	copy(w.Data[:], workdata[:])
-	w.Target = s.Target
-	w.Nonce2 = s.PoolWork.Nonce2
-	poolLog.Tracef("final data %v, target %v", hex.EncodeToString(w.Data[:]), w.Target)
-	s.PoolWork.Work = &w
-	return nil
 
+	var workData [192]byte
+	copy(workData[:], workdata[:])
+	givenTs := binary.LittleEndian.Uint32(
+		workData[128+4*timestampWord : 132+4*timestampWord])
+	atomic.StoreUint32(&s.latestJobTime, givenTs)
+
+	w := NewWork(workData, s.Target, givenTs, uint32(time.Now().Unix()), false)
+
+	poolLog.Tracef("Stratum prepated work data %v, target %032x",
+		hex.EncodeToString(w.Data[:]), w.Target.Bytes())
+	s.PoolWork.Work = w
+
+	return nil
 }
 
 // PrepSubmit formats a mining.sumbit message from the solved work.
 func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
+	poolLog.Debugf("Stratum got valid work to submit %x", data)
+	poolLog.Debugf("Stratum got valid work hash %v",
+		chainhash.HashFuncH(data[0:180]))
+	data2 := make([]byte, 180)
+	copy(data2, data[0:180])
+
 	sub := Submit{}
 	sub.Method = "mining.submit"
 
 	// Format data to send off.
-
 	hexData := hex.EncodeToString(data)
 	decodedData, err := hex.DecodeString(hexData)
 	if err != nil {
-		poolLog.Error("Error decoding data.")
+		poolLog.Error("Error decoding data")
 		return sub, err
 	}
 
@@ -888,58 +864,54 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 	bhBuf := bytes.NewReader(decodedData[0:wire.MaxBlockHeaderPayload])
 	err = submittedHeader.Deserialize(bhBuf)
 	if err != nil {
-		poolLog.Error("Error generating header.")
+		poolLog.Error("Error generating header")
 		return sub, err
 	}
 
-	//en2 := strconv.FormatUint(s.PoolWork.ExtraNonce2, 16)
-	nonce := strconv.FormatUint(uint64(submittedHeader.Nonce), 16)
-	time := encodeTime(submittedHeader.Timestamp)
-
-	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
-	if err != nil {
-		poolLog.Error("Error decoding ExtraNonce1.")
-		//return err
-	}
-	poolLog.Tracef("en1 %v s.PoolWork.ExtraNonce1 %v", en1, s.PoolWork.ExtraNonce1)
-	// Work out padding
-	tmp := []string{"%0", strconv.Itoa(int(s.PoolWork.ExtraNonce2Length) * 2), "x"}
-	fmtString := strings.Join(tmp, "")
-	en2, err := hex.DecodeString(fmt.Sprintf(fmtString, s.PoolWork.ExtraNonce2))
-	if err != nil {
-		poolLog.Error("Error decoding ExtraNonce2.")
-		//return err
-	}
-	poolLog.Errorf("en2 %v s.PoolWork.ExtraNonce2 %v", en2, s.PoolWork.ExtraNonce2)
-	extraNonce := append(en1[:], en2[:]...)
-	poolLog.Errorf("extraNonce %v", extraNonce)
-
 	s.ID++
 	sub.ID = s.ID
 	s.submitID = s.ID
 	s.submitted = true
 
-	poolLog.Tracef("ntime %v", s.PoolWork.Ntime)
-
-	poolLog.Tracef("raw User %v JobId %v xnonce2 %v xnonce2length %v time %v nonce %v", s.User, s.PoolWork.JobID, s.PoolWork.ExtraNonce2, s.PoolWork.ExtraNonce2Length, submittedHeader.Timestamp, submittedHeader.Nonce)
+	latestWorkTs := atomic.LoadUint32(&s.latestJobTime)
+	if uint32(submittedHeader.Timestamp.Unix()) != latestWorkTs {
+		return sub, ErrStatumStaleWork
+	}
+
+	// The timestamp string should be:
+	//
+	//   timestampStr := fmt.Sprintf("%08x",
+	//     uint32(submittedHeader.Timestamp.Unix()))
+	//
+	// but the "stratum" protocol appears to only use this value
+	// to check if the miner is in sync with the latest announcement
+	// of work from the pool. If this value is anything other than
+	// the timestamp of the latest pool work timestamp, work gets
+	// rejected from the current implementation.
+	timestampStr := fmt.Sprintf("%08x", latestWorkTs)
+	nonceStr := fmt.Sprintf("%08x", submittedHeader.Nonce)
+	xnonceStr := hex.EncodeToString(data[144:156])
 
-	xnonce2str := hex.EncodeToString(data[144:156])
-
-	poolLog.Tracef("encoded User %v JobId %v xnonce2 %v time %v nonce %v", s.User, s.PoolWork.JobID, xnonce2str, string(time), nonce)
-
-	sub.Params = []string{s.User, s.PoolWork.JobID, xnonce2str, s.PoolWork.Ntime, nonce}
 	// pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr
+	sub.Params = []string{s.User, s.PoolWork.JobID, xnonceStr, timestampStr,
+		nonceStr}
 
 	return sub, nil
 }
 
 // Various helper functions for formatting are below.
 
-func encodeTime(t time.Time) []byte {
-	buf := make([]byte, 8)
-	u := uint64(t.Unix())
-	binary.BigEndian.PutUint64(buf, u)
-	return buf
+// uint32SwapSlice swaps the endianness of a slice of uint32s in place, one
+// 4-byte word at a time. The length of the slice the pointer refers to
+// should be a multiple of 4; trailing bytes past the last word are skipped.
+func uint32SwapSlice(aPtr *[]byte) {
+	a := *aPtr
+	sz := len(a)
+	itrs := sz / 4
+	for i := 0; i < itrs; i++ {
+		a[(i*4)], a[(i*4)+3] = a[(i*4)+3], a[i*4]
+		a[(i*4)+1], a[(i*4)+2] = a[(i*4)+2], a[(i*4)+1]
+	}
 }
 
 func reverseS(s string) (string, error) {
@@ -986,7 +958,9 @@ func revHash(hash string) string {
 	revHash := ""
 	for i := 0; i < 7; i++ {
 		j := i * 8
-		part := fmt.Sprintf("%c%c%c%c%c%c%c%c", hash[6+j], hash[7+j], hash[4+j], hash[5+j], hash[2+j], hash[3+j], hash[0+j], hash[1+j])
+		part := fmt.Sprintf("%c%c%c%c%c%c%c%c",
+			hash[6+j], hash[7+j], hash[4+j], hash[5+j],
+			hash[2+j], hash[3+j], hash[0+j], hash[1+j])
 		revHash += part
 	}
 	return revHash

From 10b02a16ea34cb48b5b30789c08bfe35707c519a Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 21 Jul 2016 22:46:29 -0400
Subject: [PATCH 015/150] Add proxy support for stratum connections.

---
 stratum.go | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/stratum.go b/stratum.go
index c1f8cfd..daf4f2d 100644
--- a/stratum.go
+++ b/stratum.go
@@ -22,6 +22,8 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 
+	"github.com/btcsuite/go-socks/socks"
+
 	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/wire"
@@ -144,7 +146,18 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 		err := errors.New("Only stratum pools supported.")
 		return nil, err
 	}
-	conn, err := net.Dial("tcp", pool)
+	var conn net.Conn
+	var err error
+	if cfg.Proxy != "" {
+		proxy := &socks.Proxy{
+			Addr:     cfg.Proxy,
+			Username: cfg.ProxyUser,
+			Password: cfg.ProxyPass,
+		}
+		conn, err = proxy.Dial("tcp", pool)
+	} else {
+		conn, err = net.Dial("tcp", pool)
+	}
 	if err != nil {
 		return nil, err
 	}
@@ -183,7 +196,18 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 
 // Reconnect reconnects to a stratum server if the connection has been lost.
 func (s *Stratum) Reconnect() error {
-	conn, err := net.Dial("tcp", s.Pool)
+	var conn net.Conn
+	var err error
+	if cfg.Proxy != "" {
+		proxy := &socks.Proxy{
+			Addr:     cfg.Proxy,
+			Username: cfg.ProxyUser,
+			Password: cfg.ProxyPass,
+		}
+		conn, err = proxy.Dial("tcp", s.Pool)
+	} else {
+		conn, err = net.Dial("tcp", s.Pool)
+	}
 	if err != nil {
 		return err
 	}

From fc629858090d4339d7f14972954c2f745f762583 Mon Sep 17 00:00:00 2001
From: cjepson <cjepson@decred.org>
Date: Tue, 26 Jul 2016 15:38:01 -0400
Subject: [PATCH 016/150] Fix solo mining.

---
 device.go  |  9 +++++----
 getwork.go | 15 +++++++++++----
 stratum.go |  1 -
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/device.go b/device.go
index 08f6850..94574da 100644
--- a/device.go
+++ b/device.go
@@ -63,13 +63,13 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 
 // NewWork is the constructor for work.
 func NewWork(data [192]byte, target *big.Int, jobTime uint32, timeReceived uint32,
-	isSolo bool) *Work {
+	isGetWork bool) *Work {
 	return &Work{
 		Data:         data,
 		Target:       target,
 		JobTime:      jobTime,
 		TimeReceived: timeReceived,
-		isSolo:       isSolo,
+		isGetWork:    isGetWork,
 	}
 }
 
@@ -78,7 +78,7 @@ type Work struct {
 	Target       *big.Int
 	JobTime      uint32
 	TimeReceived uint32
-	isSolo       bool
+	isGetWork    bool
 }
 
 type Device struct {
@@ -364,7 +364,7 @@ func (d *Device) runDevice() error {
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
 		ts := d.work.JobTime
-		if d.work.isSolo {
+		if d.work.isGetWork {
 			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
 			ts = d.work.JobTime + diffSeconds
 		}
@@ -456,6 +456,7 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	// Construct the final block header.
 	data := make([]byte, 192)
 	copy(data, d.work.Data[:])
+
 	binary.BigEndian.PutUint32(data[128+4*timestampWord:], ts)
 	binary.BigEndian.PutUint32(data[128+4*nonce0Word:], nonce0)
 	binary.BigEndian.PutUint32(data[128+4*nonce1Word:], nonce1)
diff --git a/getwork.go b/getwork.go
index fcb2231..385d7ad 100644
--- a/getwork.go
+++ b/getwork.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"crypto/tls"
 	"crypto/x509"
+	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
@@ -178,13 +179,19 @@ func GetWork() (*Work, error) {
 		return nil, fmt.Errorf("Wrong target length: got %d, expected 32",
 			len(target))
 	}
+
 	bigTarget := new(big.Int)
-	bigTarget.SetString(hex.EncodeToString(target), 16)
+	bigTarget.SetBytes(reverse(target))
+
+	var workData [192]byte
+	copy(workData[:], data)
+	givenTs := binary.LittleEndian.Uint32(
+		workData[128+4*timestampWord : 132+4*timestampWord])
+	w := NewWork(workData, bigTarget, givenTs, uint32(time.Now().Unix()), true)
 
-	var w Work
-	copy(w.Data[:], data)
 	w.Target = bigTarget
-	return &w, nil
+
+	return w, nil
 }
 
 // GetPoolWork gets work from a stratum enabled pool
diff --git a/stratum.go b/stratum.go
index daf4f2d..12fec27 100644
--- a/stratum.go
+++ b/stratum.go
@@ -988,5 +988,4 @@ func revHash(hash string) string {
 		revHash += part
 	}
 	return revHash
-
 }

From 7c4b076ec03cb0032b162ae70f953067354f32fd Mon Sep 17 00:00:00 2001
From: cjepson <cjepson@decred.org>
Date: Tue, 26 Jul 2016 18:14:09 -0400
Subject: [PATCH 017/150] Add device auto-calibration for kernel work size

The miner previously would only take flat kernel work sizes (worksize
argument) or exponential kernel work sizes (intensity argument). Instead
of using explicitly declared work sizes by default, gominer now calibrates
the work size automatically on start up to target a preset amount of time
in milliseconds for kernel execution. This can also be tweaked by the end
user by setting the --autocalibrate=n flag manually.
---
 calibrate.go | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++
 config.go    |  43 ++++++++----------
 device.go    |  50 ++++++++++++++++-----
 getwork.go   |   2 +
 miner.go     |  31 ++++++++-----
 stratum.go   |   2 +-
 6 files changed, 202 insertions(+), 48 deletions(-)
 create mode 100644 calibrate.go

diff --git a/calibrate.go b/calibrate.go
new file mode 100644
index 0000000..aa3320e
--- /dev/null
+++ b/calibrate.go
@@ -0,0 +1,122 @@
+// Copyright (c) 2016 The Decred developers.
+
+package main
+
+import (
+	"math"
+	"time"
+	"unsafe"
+
+	"github.com/decred/gominer/cl"
+)
+
+// getKernelExecutionTime times one kernel run of globalWorksize work items.
+func (d *Device) getKernelExecutionTime(globalWorksize uint32) (time.Duration,
+	error) {
+	d.work = Work{}
+
+	minrLog.Tracef("Started GPU #%d: %s for kernel execution time fetch",
+		d.index, d.deviceName)
+	outputData := make([]uint32, outputBufferSize)
+
+	var status cl.CL_int
+
+	// arg 0: pointer to the output buffer
+	obuf := d.outputBuffer
+	status = cl.CLSetKernelArg(d.kernel, 0,
+		cl.CL_size_t(unsafe.Sizeof(obuf)),
+		unsafe.Pointer(&obuf))
+	if status != cl.CL_SUCCESS {
+		return time.Duration(0), clError(status, "CLSetKernelArg")
+	}
+
+	// args 1..8: midstate
+	for i := 0; i < 8; i++ {
+		ms := d.midstate[i]
+		status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1),
+			uint32Size, unsafe.Pointer(&ms))
+		if status != cl.CL_SUCCESS {
+			return time.Duration(0), clError(status, "CLSetKernelArg")
+		}
+	}
+
+	// args 9..20: lastBlock except nonce0Word, which is skipped over
+	i2 := 0
+	for i := 0; i < 12; i++ {
+		if i2 == nonce0Word {
+			i2++
+		}
+		lb := d.lastBlock[i2]
+		status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9),
+			uint32Size, unsafe.Pointer(&lb))
+		if status != cl.CL_SUCCESS {
+			return time.Duration(0), clError(status, "CLSetKernelArg")
+		}
+		i2++
+	}
+
+	// Clear the found count (the first word of the output buffer).
+	status = cl.CLEnqueueWriteBuffer(d.queue, d.outputBuffer,
+		cl.CL_FALSE, 0, uint32Size, unsafe.Pointer(&zeroSlice[0]),
+		0, nil, nil)
+	if status != cl.CL_SUCCESS {
+		return time.Duration(0), clError(status, "CLEnqueueWriteBuffer")
+	}
+
+	// Execute the kernel; the timing spans enqueue through the read back.
+	currentTime := time.Now()
+	var globalWorkSize [1]cl.CL_size_t
+	globalWorkSize[0] = cl.CL_size_t(globalWorksize)
+	var localWorkSize [1]cl.CL_size_t
+	localWorkSize[0] = localWorksize
+	status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil,
+		globalWorkSize[:], localWorkSize[:], 0, nil, nil)
+	if status != cl.CL_SUCCESS {
+		return time.Duration(0), clError(status, "CLEnqueueNDRangeKernel")
+	}
+
+	// Read the output buffer, capturing the status of the read itself.
+	status = cl.CLEnqueueReadBuffer(d.queue, d.outputBuffer, cl.CL_TRUE, 0,
+		uint32Size*outputBufferSize, unsafe.Pointer(&outputData[0]), 0,
+		nil, nil)
+	if status != cl.CL_SUCCESS {
+		return time.Duration(0), clError(status, "CLEnqueueReadBuffer")
+	}
+
+	elapsedTime := time.Since(currentTime)
+	minrLog.Tracef("GPU #%d: Kernel execution to read time for work "+
+		"size calibration: %v", d.index, elapsedTime)
+
+	return elapsedTime, nil
+}
+
+// calcWorkSizeForMilliseconds calibrates a work size, rounded up to a
+// multiple of 256, whose kernel execution takes roughly ms milliseconds.
+func (d *Device) calcWorkSizeForMilliseconds(ms int) (uint32, error) {
+	workSize := uint32(1 << 10)
+	timeToAchieve := time.Duration(ms) * time.Millisecond
+	for {
+		execTime, err := d.getKernelExecutionTime(workSize)
+		if err != nil {
+			return 0, err
+		}
+
+		// If we fail to reach the desired execution time, double
+		// the work size and try again.
+		if execTime < timeToAchieve {
+			workSize <<= 1
+			continue
+		}
+
+		// We're past the desired execution time, so scale the work
+		// size down linearly to what the ideal size should be.
+		adj := float64(workSize) * (float64(timeToAchieve) / float64(execTime))
+		adj /= 256.0
+		adjMultiple256 := uint32(math.Ceil(adj))
+		workSize = adjMultiple256 * 256
+
+		break
+	}
+
+	return workSize, nil
+}
diff --git a/config.go b/config.go
index d3c1685..800de73 100644
--- a/config.go
+++ b/config.go
@@ -26,14 +26,15 @@ const (
 )
 
 var (
-	minerHomeDir       = dcrutil.AppDataDir("gominer", false)
-	dcrdHomeDir        = dcrutil.AppDataDir("dcrd", false)
-	defaultConfigFile  = filepath.Join(minerHomeDir, defaultConfigFilename)
-	defaultRPCServer   = "localhost"
-	defaultRPCCertFile = filepath.Join(dcrdHomeDir, "rpc.cert")
-	defaultLogDir      = filepath.Join(minerHomeDir, defaultLogDirname)
-	defaultIntensity   = []string{}
-	defaultWorkSize    = []string{}
+	minerHomeDir         = dcrutil.AppDataDir("gominer", false)
+	dcrdHomeDir          = dcrutil.AppDataDir("dcrd", false)
+	defaultConfigFile    = filepath.Join(minerHomeDir, defaultConfigFilename)
+	defaultRPCServer     = "localhost"
+	defaultRPCCertFile   = filepath.Join(dcrdHomeDir, "rpc.cert")
+	defaultLogDir        = filepath.Join(minerHomeDir, defaultLogDirname)
+	defaultAutocalibrate = 500
+	defaultIntensity     = []string{}
+	defaultWorkSize      = []string{}
 
 	// Took these values from cgminer.
 	minIntensity = 8
@@ -71,6 +72,7 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
+	Autocalibrate int      `short:"A" long:"autocalibrate" description:"Use GPU autocalibration to achieve a kernel execution timing of the passed number of milliseconds"`
 	Intensity     []string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device, use multiple flags for multiple devices"`
 	IntensityInts []int
 	WorkSize      []string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity), use multiple flags for multiple devices"`
@@ -225,14 +227,15 @@ func cleanAndExpandPath(path string) string {
 func loadConfig() (*config, []string, error) {
 	// Default config.
 	cfg := config{
-		ConfigFile: defaultConfigFile,
-		DebugLevel: defaultLogLevel,
-		LogDir:     defaultLogDir,
-		RPCServer:  defaultRPCServer,
-		RPCCert:    defaultRPCCertFile,
-		Intensity:  defaultIntensity,
-		ClKernel:   defaultClKernel,
-		WorkSize:   defaultWorkSize,
+		ConfigFile:    defaultConfigFile,
+		DebugLevel:    defaultLogLevel,
+		LogDir:        defaultLogDir,
+		RPCServer:     defaultRPCServer,
+		RPCCert:       defaultRPCCertFile,
+		Autocalibrate: defaultAutocalibrate,
+		Intensity:     defaultIntensity,
+		ClKernel:      defaultClKernel,
+		WorkSize:      defaultWorkSize,
 	}
 
 	// Create the home directory if it doesn't already exist.
@@ -302,14 +305,6 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
-	// The intensity or worksize must be set by the user.
-	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
-		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-		err := fmt.Errorf("Intensity or work size must be set")
-		fmt.Fprintln(os.Stderr, err)
-		return nil, nil, err
-	}
-
 	// Check the intensities if the user is setting that.
 	cfg.IntensityInts = make([]int, len(cfg.Intensity))
 	if !reflect.DeepEqual(cfg.Intensity, defaultIntensity) {
diff --git a/device.go b/device.go
index 94574da..fa05d07 100644
--- a/device.go
+++ b/device.go
@@ -1,3 +1,5 @@
+// Copyright (c) 2016 The Decred developers.
+
 package main
 
 import (
@@ -92,6 +94,8 @@ type Device struct {
 	program      cl.CL_program
 	kernel       cl.CL_kernel
 
+	workSize uint32
+
 	// extraNonce is the device extraNonce, where the first
 	// byte is the device ID (supporting up to 255 devices)
 	// while the last 3 bytes is the extraNonce value. If
@@ -224,6 +228,39 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 
 	d.started = uint32(time.Now().Unix())
 
+	// Autocalibrate the desired work size for the kernel, or use one of the
+	// values passed explicitly by the user. Autocalibration is used only
+	// when the user has set neither an intensity nor a work size.
+	userSetWorkSize := true
+	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
+		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+		userSetWorkSize = false
+	}
+
+	var globalWorkSize uint32
+	if !userSetWorkSize {
+		idealWorkSize, err := d.calcWorkSizeForMilliseconds(cfg.Autocalibrate)
+		if err != nil {
+			return nil, err
+		}
+
+		minrLog.Debugf("Autocalibration successful, work size for %v"+
+			"ms per kernel execution on device %v determined to be %v",
+			cfg.Autocalibrate, d.index, idealWorkSize)
+
+		globalWorkSize = idealWorkSize
+	} else {
+		if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+			globalWorkSize = 1 << uint32(cfg.IntensityInts[d.index])
+		} else {
+			globalWorkSize = uint32(cfg.WorkSizeInts[d.index])
+		}
+	}
+	intensity := math.Log2(float64(globalWorkSize))
+	minrLog.Infof("GPU #%d: Work size set to %v ('intensity' %v)",
+		d.index, globalWorkSize, intensity)
+	d.workSize = globalWorkSize
+
 	return d, nil
 }
 
@@ -328,17 +365,6 @@ func (d *Device) testFoundCandidate() {
 func (d *Device) runDevice() error {
 	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
-	var globalWorksize uint32
-	if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-		globalWorksize = 1 << uint32(cfg.IntensityInts[d.index])
-		minrLog.Debugf("GPU #%d: Intensity %v (work size: %v)", d.index,
-			cfg.IntensityInts[d.index], globalWorksize)
-	} else {
-		globalWorksize = uint32(cfg.WorkSizeInts[d.index])
-		intensity := math.Log2(float64(cfg.WorkSizeInts[d.index]))
-		minrLog.Debugf("GPU #%d: Work size: %v ('intensity' %v)", d.index,
-			cfg.WorkSizeInts[d.index], intensity)
-	}
 
 	// Bump the extraNonce for the device it's running on
 	// when you begin mining. This ensures each GPU is doing
@@ -415,7 +441,7 @@ func (d *Device) runDevice() error {
 		// Execute the kernel and follow its execution time.
 		currentTime := time.Now()
 		var globalWorkSize [1]cl.CL_size_t
-		globalWorkSize[0] = cl.CL_size_t(globalWorksize)
+		globalWorkSize[0] = cl.CL_size_t(d.workSize)
 		var localWorkSize [1]cl.CL_size_t
 		localWorkSize[0] = localWorksize
 		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil,
diff --git a/getwork.go b/getwork.go
index 385d7ad..5f34067 100644
--- a/getwork.go
+++ b/getwork.go
@@ -1,3 +1,5 @@
+// Copyright (c) 2016 The Decred developers.
+
 package main
 
 import (
diff --git a/miner.go b/miner.go
index 29ae241..c47b197 100644
--- a/miner.go
+++ b/miner.go
@@ -1,3 +1,5 @@
+// Copyright (c) 2016 The Decred developers.
+
 package main
 
 import (
@@ -82,17 +84,24 @@ func NewMiner() (*Miner, error) {
 	}
 
 	// Check the number of intensities/work sizes versus the number of devices.
-	if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-		if len(cfg.Intensity) != len(deviceIDs) {
-			return nil, fmt.Errorf("Intensities supplied, but number supplied "+
-				"did not match the number of GPUs (got %v, want %v)",
-				len(cfg.Intensity), len(deviceIDs))
-		}
-	} else {
-		if len(cfg.WorkSize) != len(deviceIDs) {
-			return nil, fmt.Errorf("WorkSize supplied, but number supplied "+
-				"did not match the number of GPUs (got %v, want %v)",
-				len(cfg.WorkSize), len(deviceIDs))
+	// The per-device counts only need validating when the user explicitly
+	// set an intensity or a work size; with neither set, every device
+	// autocalibrates its own work size instead.
+	userSetWorkSize := !(reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
+		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize))
+	if userSetWorkSize {
+		if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
+			if len(cfg.Intensity) != len(deviceIDs) {
+				return nil, fmt.Errorf("Intensities supplied, but number supplied "+
+					"did not match the number of GPUs (got %v, want %v)",
+					len(cfg.Intensity), len(deviceIDs))
+			}
+		} else {
+			if len(cfg.WorkSize) != len(deviceIDs) {
+				return nil, fmt.Errorf("WorkSize supplied, but number supplied "+
+					"did not match the number of GPUs (got %v, want %v)",
+					len(cfg.WorkSize), len(deviceIDs))
+			}
 		}
 	}
 
diff --git a/stratum.go b/stratum.go
index 12fec27..3de1479 100644
--- a/stratum.go
+++ b/stratum.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers
+// Copyright (c) 2016 The Decred developers.
 
 package main
 

From dc6fdccdc241d5fb7f0cde0fd64e55ff2860e143 Mon Sep 17 00:00:00 2001
From: cjepson <cjepson@decred.org>
Date: Wed, 27 Jul 2016 10:50:19 -0400
Subject: [PATCH 018/150] Fix some stratum difficulty bugs

Errors are now thrown if the passed pool difficulty is not a whole number.
---
 stratum.go | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/stratum.go b/stratum.go
index 3de1479..2bf7acf 100644
--- a/stratum.go
+++ b/stratum.go
@@ -12,6 +12,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 	"math/big"
 	"net"
 	"os"
@@ -175,7 +176,10 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 
 	// Target for share is 1 unless we hear otherwise.
 	stratum.Diff = 1
-	stratum.Target = diffToTarget(stratum.Diff)
+	stratum.Target, err = diffToTarget(stratum.Diff)
+	if err != nil {
+		return nil, err
+	}
 	stratum.PoolWork.NewWork = false
 	stratum.Reader = bufio.NewReader(stratum.Conn)
 	go stratum.Listen()
@@ -667,7 +671,10 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if !ok {
 			return nil, errJsonType
 		}
-		s.Target = diffToTarget(difficulty)
+		s.Target, err = diffToTarget(difficulty)
+		if err != nil {
+			return nil, err
+		}
 		s.Diff = difficulty
 		var nres = StratumMsg{}
 		nres.Method = method
@@ -961,13 +968,23 @@ func reverseToInt(s string) (int32, error) {
 }
 
 // diffToTarget converts a whole number difficulty into a target.
-func diffToTarget(diff float64) *big.Int {
+func diffToTarget(diff float64) (*big.Int, error) {
+	if diff <= 0 {
+		return nil, fmt.Errorf("invalid pool difficulty %v (0 or less than "+
+			"zero passed)", diff)
+	}
+
+	if math.Floor(diff) < diff {
+		return nil, fmt.Errorf("invalid pool difficulty %v (not a whole "+
+			"number)", diff)
+	}
+
 	divisor := new(big.Int).SetInt64(int64(diff))
 	max := chainParams.PowLimit
 	target := new(big.Int)
 	target.Div(max, divisor)
 
-	return target
+	return target, nil
 }
 
 func reverse(src []byte) []byte {

From 1d4c207327d1819882be174665c9875213bbf7aa Mon Sep 17 00:00:00 2001
From: cjepson <cjepson@decred.org>
Date: Wed, 27 Jul 2016 11:59:13 -0400
Subject: [PATCH 019/150] Fix GH/s display

---
 device.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/device.go b/device.go
index fa05d07..98b9c91 100644
--- a/device.go
+++ b/device.go
@@ -522,7 +522,7 @@ func (d *Device) SetWork(w *Work) {
 
 func formatHashrate(h float64) string {
 	if h > 1000000000 {
-		return fmt.Sprintf("%.1fGH/s", h/1000000000)
+		return fmt.Sprintf("%.3fGH/s", h/1000000000)
 	} else if h > 1000000 {
 		return fmt.Sprintf("%.0fMH/s", h/1000000)
 	} else if h > 1000 {

From 13658ed5ddc060360137435593b2760463d2380c Mon Sep 17 00:00:00 2001
From: C Jepson <cjepson@users.noreply.github.com>
Date: Wed, 27 Jul 2016 12:18:03 -0400
Subject: [PATCH 020/150] Correctly increment rejected shares (#31)

Fixes #29.
---
 miner.go   | 8 ++++++--
 stratum.go | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/miner.go b/miner.go
index c47b197..385e7b0 100644
--- a/miner.go
+++ b/miner.go
@@ -155,11 +155,15 @@ func (m *Miner) workSubmitThread() {
 			} else {
 				accepted, err := GetPoolWorkSubmit(data, m.pool)
 				if err != nil {
-					if err == ErrStatumStaleWork {
+					switch err {
+					case ErrStatumStaleWork:
 						stale := atomic.LoadUint64(&m.staleShares)
 						stale++
 						atomic.StoreUint64(&m.staleShares, stale)
-					} else {
+
+						minrLog.Debugf("Share submitted to pool was stale")
+
+					default:
 						inval := atomic.LoadUint64(&m.invalidShares)
 						inval++
 						atomic.StoreUint64(&m.invalidShares, inval)
diff --git a/stratum.go b/stratum.go
index 2bf7acf..ac96ec9 100644
--- a/stratum.go
+++ b/stratum.go
@@ -273,7 +273,7 @@ func (s *Stratum) Listen() {
 			}
 			if aResp.ID == s.submitID {
 				if aResp.Result {
-					poolLog.Info("Share Accepted")
+					poolLog.Debugf("Share accepted")
 				} else {
 					poolLog.Error("Share rejected: ", aResp.Error.ErrStr)
 				}

From 44a4882e3a18733f4113d3d80efcc901676488cb Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 27 Jul 2016 12:11:19 -0400
Subject: [PATCH 021/150] Add some build and pool info to readme.

---
 README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.md b/README.md
index efac1bb..91e1d14 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,15 @@ You need to have OpenCL installed. To download and build gominer, run:
 
     go get github.com/decred/gominer
 
+On Ubuntu 16.04 you can install the necessary OpenCL packages (for
+Intel Graphics cards) with
+
+    sudo apt-get install beignet-dev
+
+Other graphics cards will need different libraries.  We have built
+successfully on Ubuntu 16.04 with go1.6.2, g++ 5.4.0 and
+beignet-dev 1.1.1-2 although other combinations should work as well.
+
 ## Running
 
 Run for benchmark:
@@ -15,3 +24,8 @@ Run for benchmark:
 Run for real mining:
 
     gominer -u myusername -P hunter2
+
+To mine on a pool:
+
+    gominer -o stratum+tcp://pool:port -m username -n password
+

From 5b459387914223e0dfe8d5f5cc032fe9e898fd4a Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 27 Jul 2016 11:18:11 -0400
Subject: [PATCH 022/150] Bump to v0.2.0

Switch from alpha to beta while we are at it.
---
 version.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/version.go b/version.go
index f8082e3..e788cbe 100644
--- a/version.go
+++ b/version.go
@@ -31,12 +31,12 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 0
-	appPatch uint = 1
+	appMinor uint = 2
+	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet
 	// per the semantic versioning spec.
-	appPreRelease = "alpha"
+	appPreRelease = "beta"
 )
 
 // appBuild is defined as a variable so it can be overridden during the build

From 1384cdd41d96fa67d55e12b4b544177ba520734e Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 27 Jul 2016 16:37:31 -0400
Subject: [PATCH 023/150] Use glide to manage dependencies.

Close #37
---
 .gitignore |  3 +++
 README.md  | 27 +++++++++++++++++++++------
 glide.lock | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 glide.yaml | 18 ++++++++++++++++++
 4 files changed, 90 insertions(+), 6 deletions(-)
 create mode 100644 glide.lock
 create mode 100644 glide.yaml

diff --git a/.gitignore b/.gitignore
index f58e2ab..8951dcd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ kernel/
 
 *~
 gominer
+
+vendor/
+
diff --git a/README.md b/README.md
index 91e1d14..87d48b9 100644
--- a/README.md
+++ b/README.md
@@ -4,12 +4,22 @@
 
 You need to have OpenCL installed. To download and build gominer, run:
 
-    go get github.com/decred/gominer
+```
+go get -u github.com/Masterminds/glide
+mkdir -p $GOPATH/src/github.com/decred
+cd $GOPATH/src/github.com/decred
+git clone  https://github.com/decred/gominer.git
+cd gominer
+glide i
+go install $(glide nv)
+```
 
 On Ubuntu 16.04 you can install the necessary OpenCL packages (for
 Intel Graphics cards) with
 
-    sudo apt-get install beignet-dev
+```
+sudo apt-get install beignet-dev
+```
 
 Other graphics cards will need different libraries.  We have built
 successfully on Ubuntu 16.04 with go1.6.2, g++ 5.4.0 and
@@ -19,13 +29,18 @@ beignet-dev 1.1.1-2 although other combinations should work as well.
 
 Run for benchmark:
 
-    gominer -B
+```
+gominer -B
+```
 
 Run for real mining:
 
-    gominer -u myusername -P hunter2
+```
+gominer -u myusername -P hunter2
+```
 
 To mine on a pool:
 
-    gominer -o stratum+tcp://pool:port -m username -n password
-
+```
+gominer -o stratum+tcp://pool:port -m username -n password
+```
diff --git a/glide.lock b/glide.lock
new file mode 100644
index 0000000..c474bb5
--- /dev/null
+++ b/glide.lock
@@ -0,0 +1,48 @@
+hash: 91c7f7aacbc4b5f82b14c9de3212c07257421b40c37926f571c3bc79f19c6060
+updated: 2016-07-27T16:32:52.717630149-04:00
+imports:
+- name: github.com/btcsuite/btclog
+  version: f96df2375f37300305f329b8e5258764b4f19a7f
+- name: github.com/btcsuite/fastsha256
+  version: 302ad4db268b46f9ebda3078f6f7397f96047735
+- name: github.com/btcsuite/go-flags
+  version: 6c288d648c1cc1befcb90cb5511dcacf64ae8e61
+- name: github.com/btcsuite/go-socks
+  version: cfe8b59e565c1a5bd4e2005d77cd9aa8b2e14524
+  subpackages:
+  - socks
+- name: github.com/btcsuite/golangcrypto
+  version: 53f62d9b43e87a6c56975cf862af7edf33a8d0df
+  subpackages:
+  - ripemd160
+- name: github.com/btcsuite/seelog
+  version: 313961b101eb55f65ae0f03ddd4e322731763b6c
+- name: github.com/davecgh/go-spew
+  version: 5215b55f46b2b919f50a1df0eaa5886afe4e3b3d
+  subpackages:
+  - spew
+- name: github.com/decred/blake256
+  version: a840e32d7c31fe2e0218607334cb120a683951a4
+- name: github.com/decred/dcrd
+  version: 83110a26ab1c9c7caa2bcdac9d4e1c5fc3192e1d
+  subpackages:
+  - blockchain
+  - chaincfg
+  - chaincfg/chainhash
+  - wire
+  - blockchain/stake
+  - database
+  - txscript
+  - chaincfg/chainec
+  - dcrec/edwards
+  - dcrec/secp256k1
+  - dcrec/secp256k1/schnorr
+- name: github.com/decred/dcrutil
+  version: 4a3bdb1cb08b49811674750998363b8b8ccfd66e
+  subpackages:
+  - base58
+- name: github.com/decred/ed25519
+  version: b0909d3f798b97a03c9e77023f97a5301a2a7900
+  subpackages:
+  - edwards25519
+testImports: []
diff --git a/glide.yaml b/glide.yaml
new file mode 100644
index 0000000..7728430
--- /dev/null
+++ b/glide.yaml
@@ -0,0 +1,18 @@
+package: github.com/decred/gominer
+import:
+- package: github.com/btcsuite/btclog
+- package: github.com/btcsuite/go-flags
+- package: github.com/btcsuite/go-socks
+  subpackages:
+  - socks
+- package: github.com/btcsuite/seelog
+- package: github.com/davecgh/go-spew
+  subpackages:
+  - spew
+- package: github.com/decred/dcrd
+  subpackages:
+  - blockchain
+  - chaincfg
+  - chaincfg/chainhash
+  - wire
+- package: github.com/decred/dcrutil

From 40342220626b40685fa6e7a069cc57cd52d96315 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 27 Jul 2016 17:16:58 -0400
Subject: [PATCH 024/150] Add all options to sample config.

Close #36.
---
 sample-gominer.conf | 49 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/sample-gominer.conf b/sample-gominer.conf
index 9034b24..124f83b 100644
--- a/sample-gominer.conf
+++ b/sample-gominer.conf
@@ -1,5 +1,25 @@
 [Application Options]
 
+; ------------------------------------------------------------------------------
+; General settings
+; ------------------------------------------------------------------------------
+;
+
+; Location of logfiles.
+; logdir=/some/path
+
+; Debug logging level.
+; Valid levels are {trace, debug, info, warn, error, critical}
+; You may also specify <subsystem>=<level>,<subsystem2>=<level>,... to set
+; log level for individual subsystems.  Use btcd --debuglevel=show to list
+; available subsystems.
+; debuglevel=info
+
+; Connect via a SOCKS5 proxy.
+; proxy=127.0.0.1:9050
+; proxyuser=
+; proxypass=
+
 ; ------------------------------------------------------------------------------
 ; Network settings
 ; ------------------------------------------------------------------------------
@@ -10,6 +30,14 @@
 ; Use simnet (cannot be used with testnet=1).
 ; simnet=1
 
+; Enable full profile on specified port.
+; profile=1234
+
+; Write cpu profile.
+; cpuprofile=/some/path
+
+; Write memory profile.
+; memprofile=/some/path
 
 ; ------------------------------------------------------------------------------
 ; RPC client settings
@@ -38,4 +66,25 @@
 ; Mining settings
 ; ------------------------------------------------------------------------------
 
+; Location of kernel to use for mining.
+; kernel=./blake256.cl
+
+; Intensity (the work size is 2^intensity) with one entry per device.
 ; intensity=26
+; intensity=25
+
+; Worksize sizes of the work to (overrides intensity) with one entry per device.
+; worksize=33554176
+; worksize=33554176
+
+; Benchmark mode only (do no real work).
+; benchmark=1
+
+; Address of stratum pool to use.
+; pool=stratum+tcp://somepool:port
+
+; Username for mining pool.
+; pooluser=
+
+; Password for mining pool.
+; poolpass=

From 0dac3b1c1ecc88984be6e724934f7fd284943278 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Thu, 28 Jul 2016 15:01:26 -0500
Subject: [PATCH 025/150] correct workdata comment (#40)

---
 notify/notify.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notify/notify.go b/notify/notify.go
index f0ca7df..f757169 100644
--- a/notify/notify.go
+++ b/notify/notify.go
@@ -36,7 +36,7 @@ func handleConnection(c net.Conn) {
 	msg4 := `{"id":3,"result":true,"error":null}`
 	msg5 := `{"id":null,"method":"mining.notify","params":["76df","7c3b9a506a98f865820e4c46aaa65cec37f18cf1bf7c508700000ac200000000","a455f69725e9c8623baa3c9c5a708aefb947702dc2b620b4c10129977e104c0275571a5ca5b1308b075fe74224504c9e6b1153f3de97235e7a8c7e58ea8f1c55010086a1d41fb3ee05000000fda400004a33121a2db33e1101000000abae0000260800008ec783570000000000000000","",[],"01000000","1a12334a","5783c78e",true]}`
 	// WorkData generated from that should be:
-	// 010000008ae2a86e4629174b33eb43d7205178823ce70f99bd3d7e24fc04000000000000b25bc74bba24acd4729e61c9f4c53e4f457dc3082d2d28355ae6e6df65e54b4a2040ba54288130410bbcfc548b13711b039bc89f17b3bacb6532bd9001e183f101005c141421c2b80400000097a50000d9f8171ad0357a0f0100000069a3000081460000dbca7657000000000000000000f808120fe43fbb000000000000000000000000000000000000000000000000
+	// 01000000509a3b7c65f8986a464c0e82ec5ca6aaf18cf13787507cbfc20a000000000000a455f69725e9c8623baa3c9c5a708aefb947702dc2b620b4c10129977e104c0275571a5ca5b1308b075fe74224504c9e6b1153f3de97235e7a8c7e58ea8f1c55010086a1d41fb3ee05000000fda400004a33121a2db33e1101000000abae0000260800008ec783570000000000000000009c152de3014335000000000000000000000000000000000000000000000000
 	msg6 := `{"id":4,"result":true,"error":null}`
 
 	reader := bufio.NewReader(c)

From 325db2016d488441d74f6966457479c8a07002cf Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 28 Jul 2016 15:57:18 -0400
Subject: [PATCH 026/150] Reconnect to pool if no usable target is provided.

If a target that we cannot use (<1 for example) is set,
do not attempt to do work without the target (which would panic).

Instead, reconnect to pool which gets back the pool's default target.

Closes #41.
---
 stratum.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/stratum.go b/stratum.go
index ac96ec9..fff2440 100644
--- a/stratum.go
+++ b/stratum.go
@@ -863,6 +863,19 @@ func (s *Stratum) PrepWork() error {
 		workData[128+4*timestampWord : 132+4*timestampWord])
 	atomic.StoreUint32(&s.latestJobTime, givenTs)
 
+	if s.Target == nil {
+		poolLog.Errorf("No target set!  Reconnecting to pool.")
+		err = s.Reconnect()
+		if err != nil {
+			poolLog.Error(err)
+			// XXX should just die at this point
+			// but we don't really have access to
+			// the channel to end everything.
+			return err
+		}
+		return nil
+	}
+
 	w := NewWork(workData, s.Target, givenTs, uint32(time.Now().Unix()), false)
 
 	poolLog.Tracef("Stratum prepated work data %v, target %032x",

From f7ad5dc0412dfa9db87b6b33388f8b7d1595f314 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 29 Jul 2016 07:54:42 -0400
Subject: [PATCH 027/150] Create stratum, work, and util packages.

This commit moves the stratum code to one package, the work type to
another package, and several general use functions to the util
package.

Closes #11
---
 calibrate.go                     |   5 +-
 device.go                        |  65 +++-----
 getwork.go                       |  18 ++-
 log.go                           |   3 +
 miner.go                         |  10 +-
 stratum/log.go                   |  26 +++
 stratum.go => stratum/stratum.go | 265 ++++++++++++-------------------
 util/util.go                     |  77 +++++++++
 work/work.go                     |  37 +++++
 9 files changed, 287 insertions(+), 219 deletions(-)
 create mode 100644 stratum/log.go
 rename stratum.go => stratum/stratum.go (79%)
 create mode 100644 util/util.go
 create mode 100644 work/work.go

diff --git a/calibrate.go b/calibrate.go
index aa3320e..6c410e6 100644
--- a/calibrate.go
+++ b/calibrate.go
@@ -8,12 +8,13 @@ import (
 	"unsafe"
 
 	"github.com/decred/gominer/cl"
+	"github.com/decred/gominer/work"
 )
 
 // getKernelExecutionTime returns the kernel execution time for a device.
 func (d *Device) getKernelExecutionTime(globalWorksize uint32) (time.Duration,
 	error) {
-	d.work = Work{}
+	d.work = work.Work{}
 
 	minrLog.Tracef("Started GPU #%d: %s for kernel execution time fetch",
 		d.index, d.deviceName)
@@ -43,7 +44,7 @@ func (d *Device) getKernelExecutionTime(globalWorksize uint32) (time.Duration,
 	// args 9..20: lastBlock except nonce
 	i2 := 0
 	for i := 0; i < 12; i++ {
-		if i2 == nonce0Word {
+		if i2 == work.Nonce0Word {
 			i2++
 		}
 		lb := d.lastBlock[i2]
diff --git a/device.go b/device.go
index 98b9c91..c03206f 100644
--- a/device.go
+++ b/device.go
@@ -16,23 +16,22 @@ import (
 	"unsafe"
 
 	"github.com/decred/dcrd/blockchain"
+	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 
 	"github.com/decred/gominer/blake256"
 	"github.com/decred/gominer/cl"
+	"github.com/decred/gominer/work"
 )
 
 const (
 	outputBufferSize = cl.CL_size_t(64)
 	localWorksize    = 64
 	uint32Size       = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
-
-	timestampWord = 2
-	nonce0Word    = 3
-	nonce1Word    = 4
-	nonce2Word    = 5
 )
 
+var chainParams = &chaincfg.MainNetParams
+
 var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
 
 func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
@@ -63,26 +62,6 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	return program_buffer[:], program_size[:], nil
 }
 
-// NewWork is the constructor for work.
-func NewWork(data [192]byte, target *big.Int, jobTime uint32, timeReceived uint32,
-	isGetWork bool) *Work {
-	return &Work{
-		Data:         data,
-		Target:       target,
-		JobTime:      jobTime,
-		TimeReceived: timeReceived,
-		isGetWork:    isGetWork,
-	}
-}
-
-type Work struct {
-	Data         [192]byte
-	Target       *big.Int
-	JobTime      uint32
-	TimeReceived uint32
-	isGetWork    bool
-}
-
 type Device struct {
 	index        int
 	platformID   cl.CL_platform_id
@@ -107,8 +86,8 @@ type Device struct {
 	midstate  [8]uint32
 	lastBlock [16]uint32
 
-	work     Work
-	newWork  chan *Work
+	work     work.Work
+	newWork  chan *work.Work
 	workDone chan []byte
 	hasWork  bool
 
@@ -153,7 +132,7 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 		deviceID:   deviceID,
 		deviceName: getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
 		quit:       make(chan struct{}),
-		newWork:    make(chan *Work, 5),
+		newWork:    make(chan *work.Work, 5),
 		workDone:   workDone,
 	}
 
@@ -273,7 +252,7 @@ func (d *Device) Release() {
 }
 
 func (d *Device) updateCurrentWork() {
-	var w *Work
+	var w *work.Work
 	if d.hasWork {
 		// If we already have work, we just need to check if there's new one
 		// without blocking if there's not.
@@ -299,7 +278,7 @@ func (d *Device) updateCurrentWork() {
 
 	// Bump and set the work ID if the work is new.
 	d.currentWorkID++
-	binary.LittleEndian.PutUint32(d.work.Data[128+4*nonce2Word:],
+	binary.LittleEndian.PutUint32(d.work.Data[128+4*work.Nonce2Word:],
 		d.currentWorkID)
 
 	// Reset the hash state
@@ -371,7 +350,7 @@ func (d *Device) runDevice() error {
 	// different work. If the extraNonce has already been
 	// set for valid work, restore that.
 	d.extraNonce += uint32(d.index) << 24
-	d.lastBlock[nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
+	d.lastBlock[work.Nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
 
 	var status cl.CL_int
 	for {
@@ -385,16 +364,16 @@ func (d *Device) runDevice() error {
 
 		// Increment extraNonce.
 		rolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
 		ts := d.work.JobTime
-		if d.work.isGetWork {
+		if d.work.IsGetWork {
 			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
 			ts = d.work.JobTime + diffSeconds
 		}
-		d.lastBlock[timestampWord] = Uint32EndiannessSwap(ts)
+		d.lastBlock[work.TimestampWord] = Uint32EndiannessSwap(ts)
 
 		// arg 0: pointer to the buffer
 		obuf := d.outputBuffer
@@ -418,7 +397,7 @@ func (d *Device) runDevice() error {
 		// args 9..20: lastBlock except nonce
 		i2 := 0
 		for i := 0; i < 12; i++ {
-			if i2 == nonce0Word {
+			if i2 == work.Nonce0Word {
 				i2++
 			}
 			lb := d.lastBlock[i2]
@@ -461,15 +440,15 @@ func (d *Device) runDevice() error {
 		for i := uint32(0); i < outputData[0]; i++ {
 			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
 				"extraNonce %08x, workID %08x, timestamp %08x",
-				d.index, i+1, outputData[i+1], d.lastBlock[nonce1Word],
+				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
 				Uint32EndiannessSwap(d.currentWorkID),
-				d.lastBlock[timestampWord])
+				d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
 			// difficulty 1 shares.
-			d.foundCandidate(d.lastBlock[timestampWord], outputData[i+1],
-				d.lastBlock[nonce1Word])
+			d.foundCandidate(d.lastBlock[work.TimestampWord], outputData[i+1],
+				d.lastBlock[work.Nonce1Word])
 		}
 
 		elapsedTime := time.Since(currentTime)
@@ -483,9 +462,9 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	data := make([]byte, 192)
 	copy(data, d.work.Data[:])
 
-	binary.BigEndian.PutUint32(data[128+4*timestampWord:], ts)
-	binary.BigEndian.PutUint32(data[128+4*nonce0Word:], nonce0)
-	binary.BigEndian.PutUint32(data[128+4*nonce1Word:], nonce1)
+	binary.BigEndian.PutUint32(data[128+4*work.TimestampWord:], ts)
+	binary.BigEndian.PutUint32(data[128+4*work.Nonce0Word:], nonce0)
+	binary.BigEndian.PutUint32(data[128+4*work.Nonce1Word:], nonce1)
 	hash := chainhash.HashFuncH(data[0:180])
 
 	// Hashes that reach this logic and fail the minimal proof of
@@ -516,7 +495,7 @@ func (d *Device) Stop() {
 	close(d.quit)
 }
 
-func (d *Device) SetWork(w *Work) {
+func (d *Device) SetWork(w *work.Work) {
 	d.newWork <- w
 }
 
diff --git a/getwork.go b/getwork.go
index 5f34067..5401a15 100644
--- a/getwork.go
+++ b/getwork.go
@@ -18,6 +18,10 @@ import (
 	"time"
 
 	"github.com/btcsuite/go-socks/socks"
+
+	"github.com/decred/gominer/stratum"
+	"github.com/decred/gominer/util"
+	"github.com/decred/gominer/work"
 )
 
 // newHTTPClient returns a new HTTP client that is configured according to the
@@ -113,7 +117,7 @@ func createHTTPClient() *http.Client {
 }
 
 // GetWork makes a getwork RPC call and returns the result (data and target)
-func GetWork() (*Work, error) {
+func GetWork() (*work.Work, error) {
 	// Generate a request to the configured RPC server.
 	protocol := "http"
 	if !cfg.NoTLS {
@@ -183,13 +187,13 @@ func GetWork() (*Work, error) {
 	}
 
 	bigTarget := new(big.Int)
-	bigTarget.SetBytes(reverse(target))
+	bigTarget.SetBytes(util.Reverse(target))
 
 	var workData [192]byte
 	copy(workData[:], data)
 	givenTs := binary.LittleEndian.Uint32(
-		workData[128+4*timestampWord : 132+4*timestampWord])
-	w := NewWork(workData, bigTarget, givenTs, uint32(time.Now().Unix()), true)
+		workData[128+4*work.TimestampWord : 132+4*work.TimestampWord])
+	w := work.NewWork(workData, bigTarget, givenTs, uint32(time.Now().Unix()), true)
 
 	w.Target = bigTarget
 
@@ -197,7 +201,7 @@ func GetWork() (*Work, error) {
 }
 
 // GetPoolWork gets work from a stratum enabled pool
-func GetPoolWork(pool *Stratum) (*Work, error) {
+func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 	// Get Next work for stratum and mark it as used
 	if pool.PoolWork.NewWork {
 		poolLog.Info("Received new work from pool.")
@@ -287,7 +291,7 @@ func GetWorkSubmit(data []byte) (bool, error) {
 }
 
 // GetPoolWorkSubmit sends the result to the stratum enabled pool
-func GetPoolWorkSubmit(data []byte, pool *Stratum) (bool, error) {
+func GetPoolWorkSubmit(data []byte, pool *stratum.Stratum) (bool, error) {
 	sub, err := pool.PrepSubmit(data)
 	if err != nil {
 		return false, err
@@ -310,7 +314,7 @@ func GetPoolWorkSubmit(data []byte, pool *Stratum) (bool, error) {
 		return false, err
 	}
 
-	pool.submitted = true
+	pool.Submitted = true
 
 	return true, nil
 }
diff --git a/log.go b/log.go
index a5a79d8..588dc21 100644
--- a/log.go
+++ b/log.go
@@ -6,6 +6,8 @@ import (
 
 	"github.com/btcsuite/btclog"
 	"github.com/btcsuite/seelog"
+
+	"github.com/decred/gominer/stratum"
 )
 
 var (
@@ -36,6 +38,7 @@ func useLogger(subsystemID string, logger btclog.Logger) {
 		minrLog = logger
 	case "POOL":
 		poolLog = logger
+		stratum.UseLogger(logger)
 	}
 }
 
diff --git a/miner.go b/miner.go
index 385e7b0..ea7c040 100644
--- a/miner.go
+++ b/miner.go
@@ -10,6 +10,8 @@ import (
 	"time"
 
 	"github.com/decred/gominer/cl"
+	"github.com/decred/gominer/stratum"
+	"github.com/decred/gominer/work"
 )
 
 func getCLPlatforms() ([]cl.CL_platform_id, error) {
@@ -49,7 +51,7 @@ type Miner struct {
 	quit             chan struct{}
 	needsWorkRefresh chan struct{}
 	wg               sync.WaitGroup
-	pool             *Stratum
+	pool             *stratum.Stratum
 
 	started       uint32
 	validShares   uint64
@@ -66,7 +68,7 @@ func NewMiner() (*Miner, error) {
 
 	// If needed, start pool code.
 	if cfg.Pool != "" && !cfg.Benchmark {
-		s, err := StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword)
+		s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword, cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version())
 		if err != nil {
 			return nil, err
 		}
@@ -156,7 +158,7 @@ func (m *Miner) workSubmitThread() {
 				accepted, err := GetPoolWorkSubmit(data, m.pool)
 				if err != nil {
 					switch err {
-					case ErrStatumStaleWork:
+					case stratum.ErrStatumStaleWork:
 						stale := atomic.LoadUint64(&m.staleShares)
 						stale++
 						atomic.StoreUint64(&m.staleShares, stale)
@@ -278,7 +280,7 @@ func (m *Miner) Run() {
 
 	if cfg.Benchmark {
 		minrLog.Warn("Running in BENCHMARK mode! No real mining taking place!")
-		work := &Work{}
+		work := &work.Work{}
 		for _, d := range m.devices {
 			d.SetWork(work)
 		}
diff --git a/stratum/log.go b/stratum/log.go
new file mode 100644
index 0000000..a256fcb
--- /dev/null
+++ b/stratum/log.go
@@ -0,0 +1,26 @@
+// Copyright (c) 2013-2015 The btcsuite developers
+// Copyright (c) 2016 The Decred developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package stratum
+
+import "github.com/btcsuite/btclog"
+
+// log is a logger that is initialized with no output filters.  This
+// means the package will not perform any logging by default until the caller
+// requests it.
+var log = btclog.Disabled
+
+// DisableLog disables all library log output.  Logging output is disabled
+// by default until either UseLogger or SetLogWriter are called.
+func DisableLog() {
+	log = btclog.Disabled
+}
+
+// UseLogger uses a specified Logger to output package logging info.
+// This should be used in preference to SetLogWriter if the caller is also
+// using btclog.
+func UseLogger(logger btclog.Logger) {
+	log = logger
+}
diff --git a/stratum.go b/stratum/stratum.go
similarity index 79%
rename from stratum.go
rename to stratum/stratum.go
index fff2440..4c8ef26 100644
--- a/stratum.go
+++ b/stratum/stratum.go
@@ -1,6 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
-package main
+package stratum
 
 import (
 	"bufio"
@@ -12,7 +12,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"math"
 	"math/big"
 	"net"
 	"os"
@@ -28,6 +27,9 @@ import (
 	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/wire"
+
+	"github.com/decred/gominer/util"
+	"github.com/decred/gominer/work"
 )
 
 var chainParams = &chaincfg.MainNetParams
@@ -38,9 +40,7 @@ var ErrStatumStaleWork = fmt.Errorf("Stale work, throwing away")
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
 type Stratum struct {
-	Pool          string
-	User          string
-	Pass          string
+	cfg           Config
 	Conn          net.Conn
 	Reader        *bufio.Reader
 	ID            uint64
@@ -49,11 +49,22 @@ type Stratum struct {
 	submitID      uint64
 	Diff          float64
 	Target        *big.Int
-	submitted     bool
+	Submitted     bool
 	PoolWork      NotifyWork
 	latestJobTime uint32
 }
 
+// Config holdes the config options that may be used by a stratum pool.
+type Config struct {
+	Pool      string
+	User      string
+	Pass      string
+	Proxy     string
+	ProxyUser string
+	ProxyPass string
+	Version   string
+}
+
 // NotifyWork holds all the info recieved from a mining.notify message along
 // with the Work data generate from it.
 type NotifyWork struct {
@@ -72,7 +83,7 @@ type NotifyWork struct {
 	Ntime             string
 	Version           string
 	NewWork           bool
-	Work              *Work
+	Work              *work.Work
 }
 
 // StratumMsg is the basic message object from stratum.
@@ -138,8 +149,16 @@ var errJsonType = errors.New("Unexpected type in json.")
 
 // StratumConn starts the initial connection to a stratum pool and sets defaults
 // in the pool object.
-func StratumConn(pool, user, pass string) (*Stratum, error) {
-	poolLog.Infof("Using pool: %v", pool)
+func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string) (*Stratum, error) {
+	var stratum Stratum
+	stratum.cfg.User = user
+	stratum.cfg.Pass = pass
+	stratum.cfg.Proxy = proxy
+	stratum.cfg.ProxyUser = proxyUser
+	stratum.cfg.ProxyPass = proxyPass
+	stratum.cfg.Version = version
+
+	log.Infof("Using pool: %v", pool)
 	proto := "stratum+tcp://"
 	if strings.HasPrefix(pool, proto) {
 		pool = strings.Replace(pool, proto, "", 1)
@@ -149,11 +168,11 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 	}
 	var conn net.Conn
 	var err error
-	if cfg.Proxy != "" {
+	if stratum.cfg.Proxy != "" {
 		proxy := &socks.Proxy{
-			Addr:     cfg.Proxy,
-			Username: cfg.ProxyUser,
-			Password: cfg.ProxyPass,
+			Addr:     stratum.cfg.Proxy,
+			Username: stratum.cfg.ProxyUser,
+			Password: stratum.cfg.ProxyPass,
 		}
 		conn, err = proxy.Dial("tcp", pool)
 	} else {
@@ -162,12 +181,9 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 	if err != nil {
 		return nil, err
 	}
-	var stratum Stratum
 	stratum.ID = 1
 	stratum.Conn = conn
-	stratum.Pool = pool
-	stratum.User = user
-	stratum.Pass = pass
+	stratum.cfg.Pool = pool
 
 	// We will set it for sure later but this really should be the value and
 	// setting it here will prevent so incorrect matches based on the
@@ -176,7 +192,7 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 
 	// Target for share is 1 unless we hear otherwise.
 	stratum.Diff = 1
-	stratum.Target, err = diffToTarget(stratum.Diff)
+	stratum.Target, err = util.DiffToTarget(stratum.Diff, chainParams.PowLimit)
 	if err != nil {
 		return nil, err
 	}
@@ -202,15 +218,15 @@ func StratumConn(pool, user, pass string) (*Stratum, error) {
 func (s *Stratum) Reconnect() error {
 	var conn net.Conn
 	var err error
-	if cfg.Proxy != "" {
+	if s.cfg.Proxy != "" {
 		proxy := &socks.Proxy{
-			Addr:     cfg.Proxy,
-			Username: cfg.ProxyUser,
-			Password: cfg.ProxyPass,
+			Addr:     s.cfg.Proxy,
+			Username: s.cfg.ProxyUser,
+			Password: s.cfg.ProxyPass,
 		}
-		conn, err = proxy.Dial("tcp", s.Pool)
+		conn, err = proxy.Dial("tcp", s.cfg.Pool)
 	} else {
-		conn, err = net.Dial("tcp", s.Pool)
+		conn, err = net.Dial("tcp", s.cfg.Pool)
 	}
 	if err != nil {
 		return err
@@ -233,31 +249,31 @@ func (s *Stratum) Reconnect() error {
 
 // Listen is the listener for the incoming messages from the stratum pool.
 func (s *Stratum) Listen() {
-	poolLog.Debug("Starting Listener")
+	log.Debug("Starting Listener")
 
 	for {
 		result, err := s.Reader.ReadString('\n')
 		if err != nil {
 			if err == io.EOF {
-				poolLog.Error("Connection lost!  Reconnecting.")
+				log.Error("Connection lost!  Reconnecting.")
 				err = s.Reconnect()
 				if err != nil {
-					poolLog.Error(err)
-					poolLog.Error("Reconnect failed.")
+					log.Error(err)
+					log.Error("Reconnect failed.")
 					os.Exit(1)
 					return
 				}
 
 			} else {
-				poolLog.Error(err)
+				log.Error(err)
 			}
 			continue
 		}
 
-		poolLog.Debug(strings.TrimSuffix(result, "\n"))
+		log.Debug(strings.TrimSuffix(result, "\n"))
 		resp, err := s.Unmarshal([]byte(result))
 		if err != nil {
-			poolLog.Error(err)
+			log.Error(err)
 			continue
 		}
 
@@ -266,41 +282,41 @@ func (s *Stratum) Listen() {
 			aResp := resp.(*BasicReply)
 			if int(aResp.ID.(uint64)) == int(s.authID) {
 				if aResp.Result {
-					poolLog.Info("Logged in")
+					log.Info("Logged in")
 				} else {
-					poolLog.Error("Auth failure.")
+					log.Error("Auth failure.")
 				}
 			}
 			if aResp.ID == s.submitID {
 				if aResp.Result {
-					poolLog.Debugf("Share accepted")
+					log.Debugf("Share accepted")
 				} else {
-					poolLog.Error("Share rejected: ", aResp.Error.ErrStr)
+					log.Error("Share rejected: ", aResp.Error.ErrStr)
 				}
-				s.submitted = false
+				s.Submitted = false
 			}
 
 		case StratumMsg:
 			nResp := resp.(StratumMsg)
-			poolLog.Trace(nResp)
+			log.Trace(nResp)
 			// Too much is still handled in unmarshaler.  Need to
 			// move stuff other than unmarshalling here.
 			switch nResp.Method {
 			case "client.show_message":
-				poolLog.Info(nResp.Params)
+				log.Info(nResp.Params)
 			case "client.reconnect":
-				poolLog.Info("Reconnect requested")
+				log.Info("Reconnect requested")
 				wait, err := strconv.Atoi(nResp.Params[2])
 				if err != nil {
-					poolLog.Error(err)
+					log.Error(err)
 					continue
 				}
 				time.Sleep(time.Duration(wait) * time.Second)
 				pool := nResp.Params[0] + ":" + nResp.Params[1]
-				s.Pool = pool
+				s.cfg.Pool = pool
 				err = s.Reconnect()
 				if err != nil {
-					poolLog.Error(err)
+					log.Error(err)
 					// XXX should just die at this point
 					// but we don't really have access to
 					// the channel to end everything.
@@ -308,25 +324,25 @@ func (s *Stratum) Listen() {
 				}
 
 			case "client.get_version":
-				poolLog.Debug("get_version request received.")
+				log.Debug("get_version request received.")
 				msg := StratumMsg{
 					Method: nResp.Method,
 					ID:     nResp.ID,
-					Params: []string{"decred-gominer/" + version()},
+					Params: []string{"decred-gominer/" + s.cfg.Version},
 				}
 				m, err := json.Marshal(msg)
 				if err != nil {
-					poolLog.Error(err)
+					log.Error(err)
 					continue
 				}
 				_, err = s.Conn.Write(m)
 				if err != nil {
-					poolLog.Error(err)
+					log.Error(err)
 					continue
 				}
 				_, err = s.Conn.Write([]byte("\n"))
 				if err != nil {
-					poolLog.Error(err)
+					log.Error(err)
 					continue
 				}
 			}
@@ -338,7 +354,7 @@ func (s *Stratum) Listen() {
 			heightHex := nResp.GenTX1[186:188] + nResp.GenTX1[184:186]
 			height, err := strconv.ParseInt(heightHex, 16, 32)
 			if err != nil {
-				poolLog.Tracef("failed to parse height %v", err)
+				log.Tracef("failed to parse height %v", err)
 				height = 0
 			}
 
@@ -349,24 +365,24 @@ func (s *Stratum) Listen() {
 			s.PoolWork.Version = nResp.BlockVersion
 			parsedNtime, err := strconv.ParseInt(nResp.Ntime, 16, 64)
 			if err != nil {
-				poolLog.Error(err)
+				log.Error(err)
 			}
 
 			s.PoolWork.Ntime = nResp.Ntime
 			s.PoolWork.NtimeDelta = parsedNtime - time.Now().Unix()
 			s.PoolWork.Clean = nResp.CleanJobs
 			s.PoolWork.NewWork = true
-			poolLog.Trace("notify: ", spew.Sdump(nResp))
+			log.Trace("notify: ", spew.Sdump(nResp))
 
 		case *SubscribeReply:
 			nResp := resp.(*SubscribeReply)
 			s.PoolWork.ExtraNonce1 = nResp.ExtraNonce1
 			s.PoolWork.ExtraNonce2Length = nResp.ExtraNonce2Length
-			poolLog.Info("Subscribe reply received.")
-			poolLog.Trace(spew.Sdump(resp))
+			log.Info("Subscribe reply received.")
+			log.Trace(spew.Sdump(resp))
 
 		default:
-			poolLog.Info("Unhandled message: ", result)
+			log.Info("Unhandled message: ", result)
 		}
 	}
 }
@@ -376,7 +392,7 @@ func (s *Stratum) Auth() error {
 	msg := StratumMsg{
 		Method: "mining.authorize",
 		ID:     s.ID,
-		Params: []string{s.User, s.Pass},
+		Params: []string{s.cfg.User, s.cfg.Pass},
 	}
 	// Auth reply has no method so need a way to identify it.
 	// Ugly, but not much choise.
@@ -386,7 +402,7 @@ func (s *Stratum) Auth() error {
 	}
 	s.authID = id
 	s.ID += 1
-	poolLog.Tracef("> %v", msg)
+	log.Tracef("> %v", msg)
 	m, err := json.Marshal(msg)
 	if err != nil {
 		return err
@@ -407,7 +423,7 @@ func (s *Stratum) Subscribe() error {
 	msg := StratumMsg{
 		Method: "mining.subscribe",
 		ID:     s.ID,
-		Params: []string{"decred-gominer/" + version()},
+		Params: []string{"decred-gominer/" + s.cfg.Version},
 	}
 	s.subID = msg.ID.(uint64)
 	s.ID++
@@ -415,7 +431,7 @@ func (s *Stratum) Subscribe() error {
 	if err != nil {
 		return err
 	}
-	poolLog.Tracef("> %v", string(m))
+	log.Tracef("> %v", string(m))
 	_, err = s.Conn.Write(m)
 	if err != nil {
 		return err
@@ -451,7 +467,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 	if err != nil {
 		return nil, err
 	}
-	poolLog.Trace("Received: method: ", method, " id: ", id)
+	log.Trace("Received: method: ", method, " id: ", id)
 	if id == s.authID {
 		var (
 			objmap      map[string]json.RawMessage
@@ -503,7 +519,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if err != nil {
 			return nil, err
 		}
-		poolLog.Trace(resi)
+		log.Trace(resi)
 		resp := &SubscribeReply{}
 
 		var objmap2 map[string]json.RawMessage
@@ -561,7 +577,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		resp.ExtraNonce2Length = resi[2].(float64)
 		return resp, nil
 	}
-	if id == s.submitID && s.submitted {
+	if id == s.submitID && s.Submitted {
 		var (
 			objmap      map[string]json.RawMessage
 			id          uint64
@@ -607,13 +623,13 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 	}
 	switch method {
 	case "mining.notify":
-		poolLog.Trace("Unmarshal mining.notify")
+		log.Trace("Unmarshal mining.notify")
 		var resi []interface{}
 		err := json.Unmarshal(objmap["params"], &resi)
 		if err != nil {
 			return nil, err
 		}
-		poolLog.Trace(resi)
+		log.Trace(resi)
 		var nres = NotifyRes{}
 		jobID, ok := resi[0].(string)
 		if !ok {
@@ -660,7 +676,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		return nres, nil
 
 	case "mining.set_difficulty":
-		poolLog.Trace("Received new difficulty.")
+		log.Trace("Received new difficulty.")
 		var resi []interface{}
 		err := json.Unmarshal(objmap["params"], &resi)
 		if err != nil {
@@ -671,7 +687,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if !ok {
 			return nil, errJsonType
 		}
-		s.Target, err = diffToTarget(difficulty)
+		s.Target, err = util.DiffToTarget(difficulty, chainParams.PowLimit)
 		if err != nil {
 			return nil, err
 		}
@@ -682,7 +698,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		var params []string
 		params = append(params, diffStr)
 		nres.Params = params
-		poolLog.Infof("Stratum difficulty set to %v", difficulty)
+		log.Infof("Stratum difficulty set to %v", difficulty)
 		return nres, nil
 
 	case "client.show_message":
@@ -728,7 +744,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if err != nil {
 			return nil, err
 		}
-		poolLog.Trace(resi)
+		log.Trace(resi)
 
 		if len(resi) < 3 {
 			return nil, errJsonType
@@ -768,7 +784,7 @@ func (s *Stratum) PrepWork() error {
 	// ID.
 	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
 	if err != nil {
-		poolLog.Error("Error decoding ExtraNonce1.")
+		log.Error("Error decoding ExtraNonce1.")
 		return err
 	}
 
@@ -777,7 +793,7 @@ func (s *Stratum) PrepWork() error {
 	fmtString := strings.Join(tmp, "")
 	en2, err := hex.DecodeString(fmt.Sprintf(fmtString, s.PoolWork.ExtraNonce2))
 	if err != nil {
-		poolLog.Error("Error decoding ExtraNonce2.")
+		log.Error("Error decoding ExtraNonce2.")
 		return err
 	}
 	extraNonce := append(en1[:], en2[:]...)
@@ -785,7 +801,7 @@ func (s *Stratum) PrepWork() error {
 	// Put coinbase transaction together.
 	cb1, err := hex.DecodeString(s.PoolWork.CB1)
 	if err != nil {
-		poolLog.Error("Error decoding Coinbase pt 1.")
+		log.Error("Error decoding Coinbase pt 1.")
 		return err
 	}
 
@@ -794,11 +810,11 @@ func (s *Stratum) PrepWork() error {
 	// Generate current ntime.
 	ntime := time.Now().Unix() + s.PoolWork.NtimeDelta
 
-	poolLog.Tracef("ntime: %x", ntime)
+	log.Tracef("ntime: %x", ntime)
 
 	// Serialize header.
 	bh := wire.BlockHeader{}
-	v, err := reverseToInt(s.PoolWork.Version)
+	v, err := util.ReverseToInt(s.PoolWork.Version)
 	if err != nil {
 		return err
 	}
@@ -806,7 +822,7 @@ func (s *Stratum) PrepWork() error {
 
 	nbits, err := hex.DecodeString(s.PoolWork.Nbits)
 	if err != nil {
-		poolLog.Error("Error decoding nbits")
+		log.Error("Error decoding nbits")
 		return err
 	}
 
@@ -835,10 +851,10 @@ func (s *Stratum) PrepWork() error {
 	}
 	copy(workdata[workPosition:], version.Bytes())
 
-	prevHash := revHash(s.PoolWork.Hash)
+	prevHash := util.RevHash(s.PoolWork.Hash)
 	p, err := hex.DecodeString(prevHash)
 	if err != nil {
-		poolLog.Error("Error encoding previous hash.")
+		log.Error("Error encoding previous hash.")
 		return err
 	}
 
@@ -852,7 +868,7 @@ func (s *Stratum) PrepWork() error {
 	var randomBytes = make([]byte, 4)
 	_, err = rand.Read(randomBytes)
 	if err != nil {
-		poolLog.Errorf("Unable to generate random bytes")
+		log.Errorf("Unable to generate random bytes")
 		return err
 	}
 	workPosition += 4
@@ -860,14 +876,14 @@ func (s *Stratum) PrepWork() error {
 	var workData [192]byte
 	copy(workData[:], workdata[:])
 	givenTs := binary.LittleEndian.Uint32(
-		workData[128+4*timestampWord : 132+4*timestampWord])
+		workData[128+4*work.TimestampWord : 132+4*work.TimestampWord])
 	atomic.StoreUint32(&s.latestJobTime, givenTs)
 
 	if s.Target == nil {
-		poolLog.Errorf("No target set!  Reconnecting to pool.")
+		log.Errorf("No target set!  Reconnecting to pool.")
 		err = s.Reconnect()
 		if err != nil {
-			poolLog.Error(err)
+			log.Error(err)
 			// XXX should just die at this point
 			// but we don't really have access to
 			// the channel to end everything.
@@ -876,9 +892,9 @@ func (s *Stratum) PrepWork() error {
 		return nil
 	}
 
-	w := NewWork(workData, s.Target, givenTs, uint32(time.Now().Unix()), false)
+	w := work.NewWork(workData, s.Target, givenTs, uint32(time.Now().Unix()), false)
 
-	poolLog.Tracef("Stratum prepated work data %v, target %032x",
+	log.Tracef("Stratum prepated work data %v, target %032x",
 		hex.EncodeToString(w.Data[:]), w.Target.Bytes())
 	s.PoolWork.Work = w
 
@@ -887,8 +903,8 @@ func (s *Stratum) PrepWork() error {
 
 // PrepSubmit formats a mining.sumbit message from the solved work.
 func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
-	poolLog.Debugf("Stratum got valid work to submit %x", data)
-	poolLog.Debugf("Stratum got valid work hash %v",
+	log.Debugf("Stratum got valid work to submit %x", data)
+	log.Debugf("Stratum got valid work hash %v",
 		chainhash.HashFuncH(data[0:180]))
 	data2 := make([]byte, 180)
 	copy(data2, data[0:180])
@@ -900,7 +916,7 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 	hexData := hex.EncodeToString(data)
 	decodedData, err := hex.DecodeString(hexData)
 	if err != nil {
-		poolLog.Error("Error decoding data")
+		log.Error("Error decoding data")
 		return sub, err
 	}
 
@@ -908,14 +924,14 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 	bhBuf := bytes.NewReader(decodedData[0:wire.MaxBlockHeaderPayload])
 	err = submittedHeader.Deserialize(bhBuf)
 	if err != nil {
-		poolLog.Error("Error generating header")
+		log.Error("Error generating header")
 		return sub, err
 	}
 
 	s.ID++
 	sub.ID = s.ID
 	s.submitID = s.ID
-	s.submitted = true
+	s.Submitted = true
 
 	latestWorkTs := atomic.LoadUint32(&s.latestJobTime)
 	if uint32(submittedHeader.Timestamp.Unix()) != latestWorkTs {
@@ -937,85 +953,8 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 	xnonceStr := hex.EncodeToString(data[144:156])
 
 	// pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr
-	sub.Params = []string{s.User, s.PoolWork.JobID, xnonceStr, timestampStr,
+	sub.Params = []string{s.cfg.User, s.PoolWork.JobID, xnonceStr, timestampStr,
 		nonceStr}
 
 	return sub, nil
 }
-
-// Various helper functions for formatting are below.
-
-// uint32SwapSlice swaps the endianess of a slice of uint32s, swapping only
-// uint32s at a time. The number of bytes in the pointer passed must be a
-// multiple of 4. The underlying slice is modified.
-func uint32SwapSlice(aPtr *[]byte) {
-	a := *aPtr
-	sz := len(a)
-	itrs := sz / 4
-	for i := 0; i < itrs; i++ {
-		a[(i*4)], a[(i*4)+3] = a[(i*4)+3], a[i*4]
-		a[(i*4)+1], a[(i*4)+2] = a[(i*4)+2], a[(i*4)+1]
-	}
-}
-
-func reverseS(s string) (string, error) {
-	a := strings.Split(s, "")
-	sRev := ""
-	if len(a)%2 != 0 {
-		return "", fmt.Errorf("Incorrect input length")
-	}
-	for i := 0; i < len(a); i += 2 {
-		tmp := []string{a[i], a[i+1], sRev}
-		sRev = strings.Join(tmp, "")
-	}
-	return sRev, nil
-}
-
-func reverseToInt(s string) (int32, error) {
-	sRev, err := reverseS(s)
-	if err != nil {
-		return 0, err
-	}
-	i, err := strconv.ParseInt(sRev, 10, 32)
-	return int32(i), err
-}
-
-// diffToTarget converts a whole number difficulty into a target.
-func diffToTarget(diff float64) (*big.Int, error) {
-	if diff <= 0 {
-		return nil, fmt.Errorf("invalid pool difficulty %v (0 or less than "+
-			"zero passed)", diff)
-	}
-
-	if math.Floor(diff) < diff {
-		return nil, fmt.Errorf("invalid pool difficulty %v (not a whole "+
-			"number)", diff)
-	}
-
-	divisor := new(big.Int).SetInt64(int64(diff))
-	max := chainParams.PowLimit
-	target := new(big.Int)
-	target.Div(max, divisor)
-
-	return target, nil
-}
-
-func reverse(src []byte) []byte {
-	dst := make([]byte, len(src))
-	for i := len(src); i > 0; i-- {
-		dst[len(src)-i] = src[i-1]
-	}
-	return dst
-}
-
-func revHash(hash string) string {
-	revHash := ""
-	for i := 0; i < 7; i++ {
-		j := i * 8
-		part := fmt.Sprintf("%c%c%c%c%c%c%c%c",
-			hash[6+j], hash[7+j], hash[4+j], hash[5+j],
-			hash[2+j], hash[3+j], hash[0+j], hash[1+j])
-		revHash += part
-	}
-	return revHash
-}
diff --git a/util/util.go b/util/util.go
new file mode 100644
index 0000000..43710d5
--- /dev/null
+++ b/util/util.go
@@ -0,0 +1,77 @@
+// Copyright (c) 2016 The Decred developers.
+
+package util
+
+import (
+	"fmt"
+	"math"
+	"math/big"
+	"strconv"
+	"strings"
+)
+
+// Reverse reverses a byte array.
+func Reverse(src []byte) []byte {
+	dst := make([]byte, len(src))
+	for i := len(src); i > 0; i-- {
+		dst[len(src)-i] = src[i-1]
+	}
+	return dst
+}
+
+// reverseS reverses a hex string.
+func reverseS(s string) (string, error) {
+	a := strings.Split(s, "")
+	sRev := ""
+	if len(a)%2 != 0 {
+		return "", fmt.Errorf("Incorrect input length")
+	}
+	for i := 0; i < len(a); i += 2 {
+		tmp := []string{a[i], a[i+1], sRev}
+		sRev = strings.Join(tmp, "")
+	}
+	return sRev, nil
+}
+
+// ReverseToInt reverse a string and converts to int32.
+func ReverseToInt(s string) (int32, error) {
+	sRev, err := reverseS(s)
+	if err != nil {
+		return 0, err
+	}
+	i, err := strconv.ParseInt(sRev, 10, 32)
+	return int32(i), err
+}
+
+// RevHash reverses a hash in string format.
+func RevHash(hash string) string {
+	revHash := ""
+	for i := 0; i < 7; i++ {
+		j := i * 8
+		part := fmt.Sprintf("%c%c%c%c%c%c%c%c",
+			hash[6+j], hash[7+j], hash[4+j], hash[5+j],
+			hash[2+j], hash[3+j], hash[0+j], hash[1+j])
+		revHash += part
+	}
+	return revHash
+}
+
+// DiffToTarget converts a whole number difficulty into a target.
+func DiffToTarget(diff float64, powLimit *big.Int) (*big.Int, error) {
+	if diff <= 0 {
+		return nil, fmt.Errorf("invalid pool difficulty %v (0 or less than "+
+			"zero passed)", diff)
+	}
+
+	if math.Floor(diff) < diff {
+		return nil, fmt.Errorf("invalid pool difficulty %v (not a whole "+
+			"number)", diff)
+	}
+
+	divisor := new(big.Int).SetInt64(int64(diff))
+	max := powLimit
+	target := new(big.Int)
+	target.Div(max, divisor)
+
+	return target, nil
+}
diff --git a/work/work.go b/work/work.go
new file mode 100644
index 0000000..d8b220d
--- /dev/null
+++ b/work/work.go
@@ -0,0 +1,37 @@
+// Copyright (c) 2016 The Decred developers.
+
+package work
+
+import (
+	"math/big"
+)
+
+// These are the locations of various data inside Work.Data.
+const (
+	TimestampWord = 2
+	Nonce0Word    = 3
+	Nonce1Word    = 4
+	Nonce2Word    = 5
+)
+
+// NewWork is the constructor for Work.
+func NewWork(data [192]byte, target *big.Int, jobTime uint32, timeReceived uint32,
+	isGetWork bool) *Work {
+	return &Work{
+		Data:         data,
+		Target:       target,
+		JobTime:      jobTime,
+		TimeReceived: timeReceived,
+		IsGetWork:    isGetWork,
+	}
+}
+
+// Work holds the data returned from getwork and if needed some stratum related
+// values.
+type Work struct {
+	Data         [192]byte
+	Target       *big.Int
+	JobTime      uint32
+	TimeReceived uint32
+	IsGetWork    bool
+}

From c9427567df02d85c4fdcd3b4415fdc3cab392777 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 3 Aug 2016 09:43:29 -0400
Subject: [PATCH 028/150] Fix benchmark mode.

This prevents a panic when comparing against the non-existent
target in benchmark mode.

Also suppress output related to shares in benchmark mode.

Fixes #45
---
 device.go | 20 +++++++++++---------
 miner.go  | 22 ++++++++++++----------
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/device.go b/device.go
index c03206f..05ad048 100644
--- a/device.go
+++ b/device.go
@@ -479,15 +479,17 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 		d.allDiffOneShares++
 	}
 
-	// Assess versus the pool or daemon target.
-	if hashNum.Cmp(d.work.Target) > 0 {
-		minrLog.Debugf("GPU #%d: Hash %v bigger than target %032x (boo)",
-			d.index, hash, d.work.Target.Bytes())
-	} else {
-		minrLog.Infof("GPU #%d: Found hash with work below target! %v (yay)",
-			d.index, hash)
-		d.validShares++
-		d.workDone <- data
+	if !cfg.Benchmark {
+		// Assess versus the pool or daemon target.
+		if hashNum.Cmp(d.work.Target) > 0 {
+			minrLog.Debugf("GPU #%d: Hash %v bigger than target %032x (boo)",
+				d.index, hash, d.work.Target.Bytes())
+		} else {
+			minrLog.Infof("GPU #%d: Found hash with work below target! %v (yay)",
+				d.index, hash)
+			d.validShares++
+			d.workDone <- data
+		}
 	}
 }
 
diff --git a/miner.go b/miner.go
index ea7c040..7dd7a5d 100644
--- a/miner.go
+++ b/miner.go
@@ -239,16 +239,18 @@ func (m *Miner) printStatsThread() {
 	defer t.Stop()
 
 	for {
-		valid := atomic.LoadUint64(&m.validShares)
-		minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v",
-			valid,
-			atomic.LoadUint64(&m.invalidShares),
-			atomic.LoadUint64(&m.staleShares))
-
-		secondsElapsed := uint32(time.Now().Unix()) - m.started
-		if (secondsElapsed / 60) > 0 {
-			utility := float64(valid) / (float64(secondsElapsed) / float64(60))
-			minrLog.Infof("Global utility (accepted shares/min): %v", utility)
+		if !cfg.Benchmark {
+			valid := atomic.LoadUint64(&m.validShares)
+			minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v",
+				valid,
+				atomic.LoadUint64(&m.invalidShares),
+				atomic.LoadUint64(&m.staleShares))
+
+			secondsElapsed := uint32(time.Now().Unix()) - m.started
+			if (secondsElapsed / 60) > 0 {
+				utility := float64(valid) / (float64(secondsElapsed) / float64(60))
+				minrLog.Infof("Global utility (accepted shares/min): %v", utility)
+			}
 		}
 		for _, d := range m.devices {
 			d.PrintStats()

From eef7ec093d37da0adcc9f8ca0c10774f264c5f89 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 5 Aug 2016 08:54:02 -0400
Subject: [PATCH 029/150] Add mutexes to protect various data from races.

While there, break up the Stratum.Listen function into a few smaller
functions.  This makes it easier to follow and easier to use defer for
the mutex Unlock() calls.

Closes #33
---
 device.go          |   6 ++
 getwork.go         |   2 +
 miner.go           |   4 +
 stratum/stratum.go | 217 +++++++++++++++++++++++++--------------------
 4 files changed, 131 insertions(+), 98 deletions(-)

diff --git a/device.go b/device.go
index 05ad048..6d9272f 100644
--- a/device.go
+++ b/device.go
@@ -12,6 +12,7 @@ import (
 	"math/big"
 	"os"
 	"reflect"
+	"sync"
 	"time"
 	"unsafe"
 
@@ -63,6 +64,7 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 }
 
 type Device struct {
+	sync.Mutex
 	index        int
 	platformID   cl.CL_platform_id
 	deviceID     cl.CL_device_id
@@ -458,6 +460,8 @@ func (d *Device) runDevice() error {
 }
 
 func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
+	d.Lock()
+	defer d.Unlock()
 	// Construct the final block header.
 	data := make([]byte, 192)
 	copy(data, d.work.Data[:])
@@ -546,6 +550,8 @@ func (d *Device) PrintStats() {
 	}
 
 	diffOneShareHashesAvg := uint64(0x00000000FFFFFFFF)
+	d.Lock()
+	defer d.Unlock()
 	averageHashRate := (float64(diffOneShareHashesAvg) *
 		float64(d.allDiffOneShares)) /
 		float64(secondsElapsed)
diff --git a/getwork.go b/getwork.go
index 5401a15..27ac8e2 100644
--- a/getwork.go
+++ b/getwork.go
@@ -292,6 +292,8 @@ func GetWorkSubmit(data []byte) (bool, error) {
 
 // GetPoolWorkSubmit sends the result to the stratum enabled pool
 func GetPoolWorkSubmit(data []byte, pool *stratum.Stratum) (bool, error) {
+	pool.Lock()
+	defer pool.Unlock()
 	sub, err := pool.PrepSubmit(data)
 	if err != nil {
 		return false, err
diff --git a/miner.go b/miner.go
index 7dd7a5d..21dc22c 100644
--- a/miner.go
+++ b/miner.go
@@ -212,8 +212,10 @@ func (m *Miner) workRefreshThread() {
 				}
 			}
 		} else {
+			m.pool.Lock()
 			if m.pool.PoolWork.NewWork {
 				work, err := GetPoolWork(m.pool)
+				m.pool.Unlock()
 				if err != nil {
 					minrLog.Errorf("Error in getpoolwork: %v", err)
 				} else {
@@ -221,6 +223,8 @@ func (m *Miner) workRefreshThread() {
 						d.SetWork(work)
 					}
 				}
+			} else {
+				m.pool.Unlock()
 			}
 		}
 		select {
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 4c8ef26..2022b2a 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -17,6 +17,7 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"time"
 
@@ -40,6 +41,7 @@ var ErrStatumStaleWork = fmt.Errorf("Stale work, throwing away")
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
 type Stratum struct {
+	sync.Mutex
 	cfg           Config
 	Conn          net.Conn
 	Reader        *bufio.Reader
@@ -279,114 +281,131 @@ func (s *Stratum) Listen() {
 
 		switch resp.(type) {
 		case *BasicReply:
-			aResp := resp.(*BasicReply)
-			if int(aResp.ID.(uint64)) == int(s.authID) {
-				if aResp.Result {
-					log.Info("Logged in")
-				} else {
-					log.Error("Auth failure.")
-				}
-			}
-			if aResp.ID == s.submitID {
-				if aResp.Result {
-					log.Debugf("Share accepted")
-				} else {
-					log.Error("Share rejected: ", aResp.Error.ErrStr)
-				}
-				s.Submitted = false
-			}
-
+			s.handleBasicReply(resp)
 		case StratumMsg:
-			nResp := resp.(StratumMsg)
-			log.Trace(nResp)
-			// Too much is still handled in unmarshaler.  Need to
-			// move stuff other than unmarshalling here.
-			switch nResp.Method {
-			case "client.show_message":
-				log.Info(nResp.Params)
-			case "client.reconnect":
-				log.Info("Reconnect requested")
-				wait, err := strconv.Atoi(nResp.Params[2])
-				if err != nil {
-					log.Error(err)
-					continue
-				}
-				time.Sleep(time.Duration(wait) * time.Second)
-				pool := nResp.Params[0] + ":" + nResp.Params[1]
-				s.cfg.Pool = pool
-				err = s.Reconnect()
-				if err != nil {
-					log.Error(err)
-					// XXX should just die at this point
-					// but we don't really have access to
-					// the channel to end everything.
-					return
-				}
-
-			case "client.get_version":
-				log.Debug("get_version request received.")
-				msg := StratumMsg{
-					Method: nResp.Method,
-					ID:     nResp.ID,
-					Params: []string{"decred-gominer/" + s.cfg.Version},
-				}
-				m, err := json.Marshal(msg)
-				if err != nil {
-					log.Error(err)
-					continue
-				}
-				_, err = s.Conn.Write(m)
-				if err != nil {
-					log.Error(err)
-					continue
-				}
-				_, err = s.Conn.Write([]byte("\n"))
-				if err != nil {
-					log.Error(err)
-					continue
-				}
-			}
-
+			s.handleStratumMsg(resp)
 		case NotifyRes:
-			nResp := resp.(NotifyRes)
-			s.PoolWork.JobID = nResp.JobID
-			s.PoolWork.CB1 = nResp.GenTX1
-			heightHex := nResp.GenTX1[186:188] + nResp.GenTX1[184:186]
-			height, err := strconv.ParseInt(heightHex, 16, 32)
-			if err != nil {
-				log.Tracef("failed to parse height %v", err)
-				height = 0
-			}
+			s.handleNotifyRes(resp)
+		case *SubscribeReply:
+			s.handleSubscribeReply(resp)
+		default:
+			log.Info("Unhandled message: ", result)
+		}
+	}
+}
 
-			s.PoolWork.Height = height
-			s.PoolWork.CB2 = nResp.GenTX2
-			s.PoolWork.Hash = nResp.Hash
-			s.PoolWork.Nbits = nResp.Nbits
-			s.PoolWork.Version = nResp.BlockVersion
-			parsedNtime, err := strconv.ParseInt(nResp.Ntime, 16, 64)
-			if err != nil {
-				log.Error(err)
-			}
+func (s *Stratum) handleBasicReply(resp interface{}) {
+	s.Lock()
+	defer s.Unlock()
+	aResp := resp.(*BasicReply)
 
-			s.PoolWork.Ntime = nResp.Ntime
-			s.PoolWork.NtimeDelta = parsedNtime - time.Now().Unix()
-			s.PoolWork.Clean = nResp.CleanJobs
-			s.PoolWork.NewWork = true
-			log.Trace("notify: ", spew.Sdump(nResp))
+	if int(aResp.ID.(uint64)) == int(s.authID) {
+		if aResp.Result {
+			log.Info("Logged in")
+		} else {
+			log.Error("Auth failure.")
+		}
+	}
+	if aResp.ID == s.submitID {
+		if aResp.Result {
+			log.Debugf("Share accepted")
+		} else {
+			log.Error("Share rejected: ", aResp.Error.ErrStr)
+		}
+		s.Submitted = false
+	}
+}
 
-		case *SubscribeReply:
-			nResp := resp.(*SubscribeReply)
-			s.PoolWork.ExtraNonce1 = nResp.ExtraNonce1
-			s.PoolWork.ExtraNonce2Length = nResp.ExtraNonce2Length
-			log.Info("Subscribe reply received.")
-			log.Trace(spew.Sdump(resp))
+func (s *Stratum) handleStratumMsg(resp interface{}) {
+	nResp := resp.(StratumMsg)
+	log.Trace(nResp)
+	// Too much is still handled in unmarshaler.  Need to
+	// move stuff other than unmarshalling here.
+	switch nResp.Method {
+	case "client.show_message":
+		log.Info(nResp.Params)
+	case "client.reconnect":
+		log.Info("Reconnect requested")
+		wait, err := strconv.Atoi(nResp.Params[2])
+		if err != nil {
+			log.Error(err)
+			return
+		}
+		time.Sleep(time.Duration(wait) * time.Second)
+		pool := nResp.Params[0] + ":" + nResp.Params[1]
+		s.cfg.Pool = pool
+		err = s.Reconnect()
+		if err != nil {
+			log.Error(err)
+			// XXX should just die at this point
+			// but we don't really have access to
+			// the channel to end everything.
+			return
+		}
 
-		default:
-			log.Info("Unhandled message: ", result)
+	case "client.get_version":
+		log.Debug("get_version request received.")
+		msg := StratumMsg{
+			Method: nResp.Method,
+			ID:     nResp.ID,
+			Params: []string{"decred-gominer/" + s.cfg.Version},
+		}
+		m, err := json.Marshal(msg)
+		if err != nil {
+			log.Error(err)
+			return
+		}
+		_, err = s.Conn.Write(m)
+		if err != nil {
+			log.Error(err)
+			return
+		}
+		_, err = s.Conn.Write([]byte("\n"))
+		if err != nil {
+			log.Error(err)
+			return
 		}
 	}
 }
 
+func (s *Stratum) handleNotifyRes(resp interface{}) {
+	s.Lock()
+	defer s.Unlock()
+	nResp := resp.(NotifyRes)
+	s.PoolWork.JobID = nResp.JobID
+	s.PoolWork.CB1 = nResp.GenTX1
+	heightHex := nResp.GenTX1[186:188] + nResp.GenTX1[184:186]
+	height, err := strconv.ParseInt(heightHex, 16, 32)
+	if err != nil {
+		log.Tracef("failed to parse height %v", err)
+		height = 0
+	}
+
+	s.PoolWork.Height = height
+	s.PoolWork.CB2 = nResp.GenTX2
+	s.PoolWork.Hash = nResp.Hash
+	s.PoolWork.Nbits = nResp.Nbits
+	s.PoolWork.Version = nResp.BlockVersion
+	parsedNtime, err := strconv.ParseInt(nResp.Ntime, 16, 64)
+	if err != nil {
+		log.Error(err)
+	}
+
+	s.PoolWork.Ntime = nResp.Ntime
+	s.PoolWork.NtimeDelta = parsedNtime - time.Now().Unix()
+	s.PoolWork.Clean = nResp.CleanJobs
+	s.PoolWork.NewWork = true
+	log.Trace("notify: ", spew.Sdump(nResp))
+}
+
+func (s *Stratum) handleSubscribeReply(resp interface{}) {
+	nResp := resp.(*SubscribeReply)
+	s.PoolWork.ExtraNonce1 = nResp.ExtraNonce1
+	s.PoolWork.ExtraNonce2Length = nResp.ExtraNonce2Length
+	log.Info("Subscribe reply received.")
+	log.Trace(spew.Sdump(resp))
+}
+
 // Auth sends a message to the pool to authorize a worker.
 func (s *Stratum) Auth() error {
 	msg := StratumMsg{
@@ -447,6 +466,8 @@ func (s *Stratum) Subscribe() error {
 // I'm sure a lot of this can be generalized but the json we deal with
 // is pretty yucky.
 func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
+	s.Lock()
+	defer s.Unlock()
 	var (
 		objmap map[string]json.RawMessage
 		method string

From 8c9e667c7298ac3cf3b3265cd4ed2650cb77cc06 Mon Sep 17 00:00:00 2001
From: Jonathan Chappelow <chappjc@users.noreply.github.com>
Date: Mon, 8 Aug 2016 08:26:27 -0700
Subject: [PATCH 030/150] Fix check for userSetWorkSize. (#51)

The check for userSetWorkSize in miner.go was broken, causing the check for the number of input intensities and work sizes to fail to detect a mismatch.
---
 miner.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/miner.go b/miner.go
index 21dc22c..f861fcd 100644
--- a/miner.go
+++ b/miner.go
@@ -86,7 +86,7 @@ func NewMiner() (*Miner, error) {
 	}
 
 	// Check the number of intensities/work sizes versus the number of devices.
-	userSetWorkSize := false
+	userSetWorkSize := true
 	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
 		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
 		userSetWorkSize = false

From ac5f94a54d2c4398acd08028a8111bb2069c9137 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Mon, 8 Aug 2016 15:54:15 -0400
Subject: [PATCH 031/150] Clean up logging.

This includes fixing logging levels, removing some redundant logs,
making some output a little more uniform, and not logging pool info
when solo mining.

Closes #21
---
 getwork.go         |  4 ++--
 main.go            |  2 +-
 miner.go           |  2 +-
 stratum/stratum.go | 17 +++++++----------
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/getwork.go b/getwork.go
index 27ac8e2..c916ddf 100644
--- a/getwork.go
+++ b/getwork.go
@@ -204,7 +204,7 @@ func GetWork() (*work.Work, error) {
 func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 	// Get Next work for stratum and mark it as used
 	if pool.PoolWork.NewWork {
-		poolLog.Info("Received new work from pool.")
+		poolLog.Debug("Received new work from pool.")
 		// Mark used
 		pool.PoolWork.NewWork = false
 
@@ -306,7 +306,7 @@ func GetPoolWorkSubmit(data []byte, pool *stratum.Stratum) (bool, error) {
 	}
 
 	// Send.
-	poolLog.Tracef("> %s", m)
+	poolLog.Tracef("%s", m)
 	_, err = pool.Conn.Write(m)
 	if err != nil {
 		return false, err
diff --git a/main.go b/main.go
index edefb21..1e8b33b 100644
--- a/main.go
+++ b/main.go
@@ -82,7 +82,7 @@ func gominerMain() error {
 	signal.Notify(c, os.Interrupt)
 	go func() {
 		<-c
-		mainLog.Info("Got Control+C, exiting...")
+		mainLog.Warn("Got Control+C, exiting...")
 		m.Stop()
 	}()
 
diff --git a/miner.go b/miner.go
index f861fcd..19904fc 100644
--- a/miner.go
+++ b/miner.go
@@ -243,7 +243,7 @@ func (m *Miner) printStatsThread() {
 	defer t.Stop()
 
 	for {
-		if !cfg.Benchmark {
+		if cfg.Pool != "" && !cfg.Benchmark {
 			valid := atomic.LoadUint64(&m.validShares)
 			minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v",
 				valid,
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 2022b2a..2428202 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -206,8 +206,6 @@ func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string)
 	if err != nil {
 		return nil, err
 	}
-	// Should NOT need this.
-	//time.Sleep(5 * time.Second)
 	err = stratum.Auth()
 	if err != nil {
 		return nil, err
@@ -301,14 +299,14 @@ func (s *Stratum) handleBasicReply(resp interface{}) {
 
 	if int(aResp.ID.(uint64)) == int(s.authID) {
 		if aResp.Result {
-			log.Info("Logged in")
+			log.Debug("Logged in")
 		} else {
 			log.Error("Auth failure.")
 		}
 	}
 	if aResp.ID == s.submitID {
 		if aResp.Result {
-			log.Debugf("Share accepted")
+			log.Debug("Share accepted")
 		} else {
 			log.Error("Share rejected: ", aResp.Error.ErrStr)
 		}
@@ -325,7 +323,7 @@ func (s *Stratum) handleStratumMsg(resp interface{}) {
 	case "client.show_message":
 		log.Info(nResp.Params)
 	case "client.reconnect":
-		log.Info("Reconnect requested")
+		log.Debug("Reconnect requested")
 		wait, err := strconv.Atoi(nResp.Params[2])
 		if err != nil {
 			log.Error(err)
@@ -377,7 +375,7 @@ func (s *Stratum) handleNotifyRes(resp interface{}) {
 	heightHex := nResp.GenTX1[186:188] + nResp.GenTX1[184:186]
 	height, err := strconv.ParseInt(heightHex, 16, 32)
 	if err != nil {
-		log.Tracef("failed to parse height %v", err)
+		log.Debugf("failed to parse height %v", err)
 		height = 0
 	}
 
@@ -402,7 +400,7 @@ func (s *Stratum) handleSubscribeReply(resp interface{}) {
 	nResp := resp.(*SubscribeReply)
 	s.PoolWork.ExtraNonce1 = nResp.ExtraNonce1
 	s.PoolWork.ExtraNonce2Length = nResp.ExtraNonce2Length
-	log.Info("Subscribe reply received.")
+	log.Debug("Subscribe reply received.")
 	log.Trace(spew.Sdump(resp))
 }
 
@@ -421,7 +419,7 @@ func (s *Stratum) Auth() error {
 	}
 	s.authID = id
 	s.ID += 1
-	log.Tracef("> %v", msg)
+	log.Tracef("%v", msg)
 	m, err := json.Marshal(msg)
 	if err != nil {
 		return err
@@ -450,7 +448,7 @@ func (s *Stratum) Subscribe() error {
 	if err != nil {
 		return err
 	}
-	log.Tracef("> %v", string(m))
+	log.Tracef("%v", string(m))
 	_, err = s.Conn.Write(m)
 	if err != nil {
 		return err
@@ -650,7 +648,6 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if err != nil {
 			return nil, err
 		}
-		log.Trace(resi)
 		var nres = NotifyRes{}
 		jobID, ok := resi[0].(string)
 		if !ok {

From 3d96ce30d6fa9ffae1f3638f73e617bbaf6649c8 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 10 Aug 2016 10:38:58 -0400
Subject: [PATCH 032/150] Reorganize some functions/packages.

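Moved several generic functions (Uint32EndiannessSwap,
RolloverExtraNonce, FormatHashRate) out of device.go and into the
util package.

Moved the OpenCL helper functions (getCLPlatforms, getCLDevices) out
of miner.go and into device.go.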
---
 device.go    | 74 +++++++++++++++++++++++++++-------------------------
 miner.go     | 32 -----------------------
 util/util.go | 31 ++++++++++++++++++++++
 3 files changed, 69 insertions(+), 68 deletions(-)

diff --git a/device.go b/device.go
index 6d9272f..62d386f 100644
--- a/device.go
+++ b/device.go
@@ -22,6 +22,7 @@ import (
 
 	"github.com/decred/gominer/blake256"
 	"github.com/decred/gominer/cl"
+	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
@@ -35,6 +36,37 @@ var chainParams = &chaincfg.MainNetParams
 
 var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
 
+func getCLPlatforms() ([]cl.CL_platform_id, error) {
+	var numPlatforms cl.CL_uint
+	status := cl.CLGetPlatformIDs(0, nil, &numPlatforms)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetPlatformIDs")
+	}
+	platforms := make([]cl.CL_platform_id, numPlatforms)
+	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetPlatformIDs")
+	}
+	return platforms, nil
+}
+
+// getCLDevices returns the list of devices for the given platform.
+func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
+	var numDevices cl.CL_uint
+	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_GPU, 0, nil,
+		&numDevices)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetDeviceIDs")
+	}
+	devices := make([]cl.CL_device_id, numDevices)
+	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices,
+		devices, nil)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetDeviceIDs")
+	}
+	return devices, nil
+}
+
 func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	var program_buffer [1][]byte
 	var program_size [1]cl.CL_size_t
@@ -101,22 +133,6 @@ type Device struct {
 	quit chan struct{}
 }
 
-// Uint32EndiannessSwap swaps the endianness of a uint32.
-func Uint32EndiannessSwap(v uint32) uint32 {
-	return (v&0x000000FF)<<24 | (v&0x0000FF00)<<8 |
-		(v&0x00FF0000)>>8 | (v&0xFF000000)>>24
-}
-
-// rolloverExtraNonce rolls over the extraNonce if it goes over 0x00FFFFFF many
-// hashes, since the first byte is reserved for the ID.
-func rolloverExtraNonce(v *uint32) {
-	if *v&0x00FFFFFF == 0x00FFFFFF {
-		*v = *v & 0xFF000000
-	} else {
-		*v++
-	}
-}
-
 func clError(status cl.CL_int, f string) error {
 	if -status < 0 || int(-status) > len(cl.ERROR_CODES_STRINGS) {
 		return fmt.Errorf("%s returned unknown error!")
@@ -352,7 +368,7 @@ func (d *Device) runDevice() error {
 	// different work. If the extraNonce has already been
 	// set for valid work, restore that.
 	d.extraNonce += uint32(d.index) << 24
-	d.lastBlock[work.Nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
+	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
 
 	var status cl.CL_int
 	for {
@@ -365,8 +381,8 @@ func (d *Device) runDevice() error {
 		}
 
 		// Increment extraNonce.
-		rolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = Uint32EndiannessSwap(d.extraNonce)
+		util.RolloverExtraNonce(&d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
@@ -375,7 +391,7 @@ func (d *Device) runDevice() error {
 			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
 			ts = d.work.JobTime + diffSeconds
 		}
-		d.lastBlock[work.TimestampWord] = Uint32EndiannessSwap(ts)
+		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
 
 		// arg 0: pointer to the buffer
 		obuf := d.outputBuffer
@@ -443,7 +459,7 @@ func (d *Device) runDevice() error {
 			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
 				"extraNonce %08x, workID %08x, timestamp %08x",
 				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
-				Uint32EndiannessSwap(d.currentWorkID),
+				util.Uint32EndiannessSwap(d.currentWorkID),
 				d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
@@ -505,20 +521,6 @@ func (d *Device) SetWork(w *work.Work) {
 	d.newWork <- w
 }
 
-func formatHashrate(h float64) string {
-	if h > 1000000000 {
-		return fmt.Sprintf("%.3fGH/s", h/1000000000)
-	} else if h > 1000000 {
-		return fmt.Sprintf("%.0fMH/s", h/1000000)
-	} else if h > 1000 {
-		return fmt.Sprintf("%.1fkH/s", h/1000)
-	} else if h == 0 {
-		return "0H/s"
-	}
-
-	return fmt.Sprintf("%.1f GH/s", h)
-}
-
 func getDeviceInfo(id cl.CL_device_id,
 	name cl.CL_device_info,
 	str string) string {
@@ -559,7 +561,7 @@ func (d *Device) PrintStats() {
 	minrLog.Infof("GPU #%d (%s) reporting average hash rate %v, %v/%v valid work",
 		d.index,
 		d.deviceName,
-		formatHashrate(averageHashRate),
+		util.FormatHashRate(averageHashRate),
 		d.validShares,
 		d.validShares+d.invalidShares)
 }
diff --git a/miner.go b/miner.go
index 19904fc..d0de333 100644
--- a/miner.go
+++ b/miner.go
@@ -9,42 +9,10 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/decred/gominer/cl"
 	"github.com/decred/gominer/stratum"
 	"github.com/decred/gominer/work"
 )
 
-func getCLPlatforms() ([]cl.CL_platform_id, error) {
-	var numPlatforms cl.CL_uint
-	status := cl.CLGetPlatformIDs(0, nil, &numPlatforms)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetPlatformIDs")
-	}
-	platforms := make([]cl.CL_platform_id, numPlatforms)
-	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetPlatformIDs")
-	}
-	return platforms, nil
-}
-
-// getCLDevices returns the list of devices for the given platform.
-func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
-	var numDevices cl.CL_uint
-	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_GPU, 0, nil,
-		&numDevices)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetDeviceIDs")
-	}
-	devices := make([]cl.CL_device_id, numDevices)
-	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices,
-		devices, nil)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetDeviceIDs")
-	}
-	return devices, nil
-}
-
 type Miner struct {
 	devices          []*Device
 	workDone         chan []byte
diff --git a/util/util.go b/util/util.go
index 43710d5..6217f54 100644
--- a/util/util.go
+++ b/util/util.go
@@ -75,3 +75,34 @@ func DiffToTarget(diff float64, powLimit *big.Int) (*big.Int, error) {
 
 	return target, nil
 }
+
+// RolloverExtraNonce rolls over the extraNonce if it goes over 0x00FFFFFF many
+// hashes, since the first byte is reserved for the ID.
+func RolloverExtraNonce(v *uint32) {
+	if *v&0x00FFFFFF == 0x00FFFFFF {
+		*v = *v & 0xFF000000
+	} else {
+		*v++
+	}
+}
+
+// Uint32EndiannessSwap swaps the endianness of a uint32.
+func Uint32EndiannessSwap(v uint32) uint32 {
+	return (v&0x000000FF)<<24 | (v&0x0000FF00)<<8 |
+		(v&0x00FF0000)>>8 | (v&0xFF000000)>>24
+}
+
+// FormatHashRate sets the units properly when displaying a hashrate.
+func FormatHashRate(h float64) string {
+	if h > 1000000000 {
+		return fmt.Sprintf("%.3fGH/s", h/1000000000)
+	} else if h > 1000000 {
+		return fmt.Sprintf("%.0fMH/s", h/1000000)
+	} else if h > 1000 {
+		return fmt.Sprintf("%.1fkH/s", h/1000)
+	} else if h == 0 {
+		return "0H/s"
+	}
+
+	return fmt.Sprintf("%.1f GH/s", h)
+}

From 8e8f392a09348dfe01aad9b5d950157e088d9033 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Wed, 10 Aug 2016 10:31:06 -0500
Subject: [PATCH 033/150] add device selection/restriction (#54)

---
 config.go | 41 ++++++++++++++++++++++++++++++++++++++++-
 device.go | 20 ++++++++++++++++++++
 miner.go  | 32 ++++++++++++++++++++++++++++++--
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/config.go b/config.go
index 800de73..8c31f28 100644
--- a/config.go
+++ b/config.go
@@ -43,6 +43,7 @@ var (
 )
 
 type config struct {
+	ListDevices bool `short:"l" long:"listdevices" description:"List number of devices."`
 	ShowVersion bool `short:"V" long:"version" description:"Display version information and exit"`
 
 	// Config / log options
@@ -72,7 +73,9 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
-	Autocalibrate int      `short:"A" long:"autocalibrate" description:"Use GPU autocalibration to achieve a kernel execution timing of the passed number of milliseconds"`
+	Autocalibrate int    `short:"A" long:"autocalibrate" description:"Use GPU autocalibration to achieve a kernel execution timing of the passed number of milliseconds"`
+	Devices       string `short:"D" long:"devices" description:"Single device ID or a comma separated list of device IDs to use."`
+	DeviceIDs     []int
 	Intensity     []string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device, use multiple flags for multiple devices"`
 	IntensityInts []int
 	WorkSize      []string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity), use multiple flags for multiple devices"`
@@ -262,6 +265,11 @@ func loadConfig() (*config, []string, error) {
 	appName := filepath.Base(os.Args[0])
 	appName = strings.TrimSuffix(appName, filepath.Ext(appName))
 	usageMessage := fmt.Sprintf("Use %s -h to show usage", appName)
+	if preCfg.ListDevices {
+		ListDevices()
+		os.Exit(0)
+	}
+
 	if preCfg.ShowVersion {
 		fmt.Println(appName, "version", version())
 		os.Exit(0)
@@ -305,6 +313,37 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
+	if len(cfg.Devices) > 0 {
+		// Parse a list like -D 1,2
+		if strings.Contains(cfg.Devices, ",") {
+			specifiedDevices := strings.Split(cfg.Devices, ",")
+			cfg.DeviceIDs = make([]int, len(specifiedDevices))
+			for i := range specifiedDevices {
+				j, err := strconv.Atoi(specifiedDevices[i])
+				if err != nil {
+					err := fmt.Errorf("Could not convert device number %v "+
+						"(%v) to int: %s", i+1, specifiedDevices[i], err.Error())
+					fmt.Fprintln(os.Stderr, err)
+					return nil, nil, err
+				}
+
+				cfg.DeviceIDs[i] = j
+			}
+			// Use specified device like -D 1
+		} else {
+			cfg.DeviceIDs = make([]int, 1)
+			i, err := strconv.Atoi(cfg.Devices)
+			if err != nil {
+				err := fmt.Errorf("Could not convert specified device %v "+
+					"to int: %s", cfg.Devices, err.Error())
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+
+			cfg.DeviceIDs[0] = i
+		}
+	}
+
 	// Check the intensities if the user is setting that.
 	cfg.IntensityInts = make([]int, len(cfg.Intensity))
 	if !reflect.DeepEqual(cfg.Intensity, defaultIntensity) {
diff --git a/device.go b/device.go
index 62d386f..6974c5e 100644
--- a/device.go
+++ b/device.go
@@ -142,6 +142,26 @@ func clError(status cl.CL_int, f string) error {
 		cl.ERROR_CODES_STRINGS[-status], status)
 }
 
+// ListDevices prints a list of GPUs present.
+func ListDevices() {
+	platformIDs, err := getCLPlatforms()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Could not get CL platforms: %v\n", err)
+		os.Exit(1)
+	}
+
+	platformID := platformIDs[0]
+	deviceIDs, err := getCLDevices(platformID)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Could not get CL devices for platform: %v\n", err)
+		os.Exit(1)
+	}
+
+	for i, deviceID := range deviceIDs {
+		fmt.Printf("GPU #%d: %s\n", i, getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"))
+	}
+}
+
 func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
 	workDone chan []byte) (*Device, error) {
 	d := &Device{
diff --git a/miner.go b/miner.go
index d0de333..1743401 100644
--- a/miner.go
+++ b/miner.go
@@ -48,11 +48,31 @@ func NewMiner() (*Miner, error) {
 		return nil, fmt.Errorf("Could not get CL platforms: %v", err)
 	}
 	platformID := platformIDs[0]
-	deviceIDs, err := getCLDevices(platformID)
+	CLdeviceIDs, err := getCLDevices(platformID)
 	if err != nil {
 		return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
 	}
 
+	var deviceIDs []cl.CL_device_id
+
+	// Enforce device restrictions if they exist
+	if len(cfg.DeviceIDs) > 0 {
+		for _, i := range cfg.DeviceIDs {
+			var found = false
+			for j, CLdeviceID := range CLdeviceIDs {
+				if i == j {
+					deviceIDs = append(deviceIDs, CLdeviceID)
+					found = true
+				}
+			}
+			if !found {
+				return nil, fmt.Errorf("Unable to find GPU #%d", i)
+			}
+		}
+	} else {
+		copy(deviceIDs, CLdeviceIDs)
+	}
+
 	// Check the number of intensities/work sizes versus the number of devices.
 	userSetWorkSize := true
 	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
@@ -77,8 +97,16 @@ func NewMiner() (*Miner, error) {
 
 	m.devices = make([]*Device, len(deviceIDs))
 	for i, deviceID := range deviceIDs {
+		// Use the real device order so i.e. -D 1 doesn't print GPU #0
+		realnum := i
+		for iCL, CLdeviceID := range CLdeviceIDs {
+			if CLdeviceID == deviceID {
+				realnum = iCL
+			}
+		}
+
 		var err error
-		m.devices[i], err = NewDevice(i, platformID, deviceID, m.workDone)
+		m.devices[i], err = NewDevice(realnum, platformID, deviceID, m.workDone)
 		if err != nil {
 			return nil, err
 		}

From 4311a398116adb1be7a4f56f771dc63b9ffea68d Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 10 Aug 2016 11:36:42 -0400
Subject: [PATCH 034/150] Add cl import that was missed after the last rebase.

---
 miner.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/miner.go b/miner.go
index 1743401..41197ee 100644
--- a/miner.go
+++ b/miner.go
@@ -9,6 +9,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/decred/gominer/cl"
 	"github.com/decred/gominer/stratum"
 	"github.com/decred/gominer/work"
 )

From 4f2d892bbd65a83594460c1608d7ac1880e9824b Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 10 Aug 2016 15:16:03 -0400
Subject: [PATCH 035/150] Bump for v0.3.0

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index e788cbe..0418a27 100644
--- a/version.go
+++ b/version.go
@@ -31,7 +31,7 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 2
+	appMinor uint = 3
 	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet

From 093470d6b738a290dff1e32905b1b82b6fd99b32 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Thu, 11 Aug 2016 08:13:21 -0500
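Subject: [PATCH 036/150] fix mining when no device is specified (#59)

copy() into the nil deviceIDs slice copies zero elements, so no devices
were selected when -D was not given. Assign the slice directly instead.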
---
 miner.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/miner.go b/miner.go
index 41197ee..267bc99 100644
--- a/miner.go
+++ b/miner.go
@@ -71,7 +71,7 @@ func NewMiner() (*Miner, error) {
 			}
 		}
 	} else {
-		copy(deviceIDs, CLdeviceIDs)
+		deviceIDs = CLdeviceIDs
 	}
 
 	// Check the number of intensities/work sizes versus the number of devices.

From 5605c021966cf3df278a748774c499877a1216f5 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Mon, 15 Aug 2016 15:00:12 -0500
Subject: [PATCH 037/150] add license for OpenCL bindings (#65)

---
 cl/LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 cl/LICENSE

diff --git a/cl/LICENSE b/cl/LICENSE
new file mode 100644
index 0000000..fa5e41a
--- /dev/null
+++ b/cl/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Rain Liu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From 853bb24cb405b56b5650afa4e2a620ff0402abc1 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Mon, 22 Aug 2016 09:52:07 -0500
Subject: [PATCH 038/150] track invalid shares properly (#60)

---
 miner.go           | 22 ++++++----------------
 stratum/stratum.go | 17 +++++++++++++----
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/miner.go b/miner.go
index 267bc99..10c2c96 100644
--- a/miner.go
+++ b/miner.go
@@ -152,10 +152,10 @@ func (m *Miner) workSubmitThread() {
 					m.needsWorkRefresh <- struct{}{}
 				}
 			} else {
-				accepted, err := GetPoolWorkSubmit(data, m.pool)
+				submitted, err := GetPoolWorkSubmit(data, m.pool)
 				if err != nil {
 					switch err {
-					case stratum.ErrStatumStaleWork:
+					case stratum.ErrStratumStaleWork:
 						stale := atomic.LoadUint64(&m.staleShares)
 						stale++
 						atomic.StoreUint64(&m.staleShares, stale)
@@ -170,19 +170,9 @@ func (m *Miner) workSubmitThread() {
 						minrLog.Errorf("Error submitting work to pool: %v", err)
 					}
 				} else {
-					if accepted {
-						val := atomic.LoadUint64(&m.validShares)
-						val++
-						atomic.StoreUint64(&m.validShares, val)
-
+					if submitted {
 						minrLog.Debugf("Submitted work to pool successfully: %v",
-							accepted)
-					} else {
-						inval := atomic.LoadUint64(&m.invalidShares)
-						inval++
-						atomic.StoreUint64(&m.invalidShares, inval)
-
-						m.invalidShares++
+							submitted)
 					}
 					m.needsWorkRefresh <- struct{}{}
 				}
@@ -241,10 +231,10 @@ func (m *Miner) printStatsThread() {
 
 	for {
 		if cfg.Pool != "" && !cfg.Benchmark {
-			valid := atomic.LoadUint64(&m.validShares)
+			valid := atomic.LoadUint64(&m.pool.ValidShares)
 			minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v",
 				valid,
-				atomic.LoadUint64(&m.invalidShares),
+				atomic.LoadUint64(&m.pool.InvalidShares),
 				atomic.LoadUint64(&m.staleShares))
 
 			secondsElapsed := uint32(time.Now().Unix()) - m.started
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 2428202..3660a2c 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -35,8 +35,8 @@ import (
 
 var chainParams = &chaincfg.MainNetParams
 
-// ErrStatumStaleWork indicates that the work to send to the pool was stale.
-var ErrStatumStaleWork = fmt.Errorf("Stale work, throwing away")
+// ErrStratumStaleWork indicates that the work to send to the pool was stale.
+var ErrStratumStaleWork = fmt.Errorf("Stale work, throwing away")
 
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
@@ -54,6 +54,9 @@ type Stratum struct {
 	Submitted     bool
 	PoolWork      NotifyWork
 	latestJobTime uint32
+
+	ValidShares   uint64
+	InvalidShares uint64
 }
 
 // Config holdes the config options that may be used by a stratum pool.
@@ -306,8 +309,15 @@ func (s *Stratum) handleBasicReply(resp interface{}) {
 	}
 	if aResp.ID == s.submitID {
 		if aResp.Result {
+			val := atomic.LoadUint64(&s.ValidShares)
+			val++
+			atomic.StoreUint64(&s.ValidShares, val)
 			log.Debug("Share accepted")
 		} else {
+			inval := atomic.LoadUint64(&s.InvalidShares)
+			inval++
+			atomic.StoreUint64(&s.InvalidShares, inval)
+
 			log.Error("Share rejected: ", aResp.Error.ErrStr)
 		}
 		s.Submitted = false
@@ -953,7 +963,7 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 
 	latestWorkTs := atomic.LoadUint32(&s.latestJobTime)
 	if uint32(submittedHeader.Timestamp.Unix()) != latestWorkTs {
-		return sub, ErrStatumStaleWork
+		return sub, ErrStratumStaleWork
 	}
 
 	// The timestamp string should be:
@@ -970,7 +980,6 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 	nonceStr := fmt.Sprintf("%08x", submittedHeader.Nonce)
 	xnonceStr := hex.EncodeToString(data[144:156])
 
-	// pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr
 	sub.Params = []string{s.cfg.User, s.PoolWork.JobID, xnonceStr, timestampStr,
 		nonceStr}
 

From 65dc98717fd4e91a34b8e123caa5ff0532f1e376 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Wed, 24 Aug 2016 15:49:42 -0500
Subject: [PATCH 039/150] make the autocalibration/device/intensity/worksize
 flags consistent (#68)

---
 config.go | 197 +++++++++++++++++++++++++++++++++++++-----------------
 device.go |  56 +++++++++++-----
 miner.go  |  25 +------
 3 files changed, 177 insertions(+), 101 deletions(-)

diff --git a/config.go b/config.go
index 8c31f28..a7a7cfa 100644
--- a/config.go
+++ b/config.go
@@ -8,7 +8,6 @@ import (
 	"net"
 	"os"
 	"path/filepath"
-	"reflect"
 	"sort"
 	"strconv"
 	"strings"
@@ -33,8 +32,6 @@ var (
 	defaultRPCCertFile   = filepath.Join(dcrdHomeDir, "rpc.cert")
 	defaultLogDir        = filepath.Join(minerHomeDir, defaultLogDirname)
 	defaultAutocalibrate = 500
-	defaultIntensity     = []string{}
-	defaultWorkSize      = []string{}
 
 	// Took these values from cgminer.
 	minIntensity = 8
@@ -73,13 +70,14 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
-	Autocalibrate int    `short:"A" long:"autocalibrate" description:"Use GPU autocalibration to achieve a kernel execution timing of the passed number of milliseconds"`
-	Devices       string `short:"D" long:"devices" description:"Single device ID or a comma separated list of device IDs to use."`
-	DeviceIDs     []int
-	Intensity     []string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device, use multiple flags for multiple devices"`
-	IntensityInts []int
-	WorkSize      []string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity), use multiple flags for multiple devices"`
-	WorkSizeInts  []int
+	Autocalibrate     string `short:"A" long:"autocalibrate" description:"GPU kernel execution target time in milliseconds. Single global value or a comma separated list."`
+	AutocalibrateInts []int
+	Devices           string `short:"D" long:"devices" description:"Single device ID or a comma separated list of device IDs to use."`
+	DeviceIDs         []int
+	Intensity         string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device. Single global value or a comma separated list."`
+	IntensityInts     []int
+	WorkSize          string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity). Single global value or a comma separated list."`
+	WorkSizeInts      []int
 
 	// Pool related options
 	Pool         string `short:"o" long:"pool" description:"Pool to connect to (e.g.stratum+tcp://pool:port)"`
@@ -230,15 +228,12 @@ func cleanAndExpandPath(path string) string {
 func loadConfig() (*config, []string, error) {
 	// Default config.
 	cfg := config{
-		ConfigFile:    defaultConfigFile,
-		DebugLevel:    defaultLogLevel,
-		LogDir:        defaultLogDir,
-		RPCServer:     defaultRPCServer,
-		RPCCert:       defaultRPCCertFile,
-		Autocalibrate: defaultAutocalibrate,
-		Intensity:     defaultIntensity,
-		ClKernel:      defaultClKernel,
-		WorkSize:      defaultWorkSize,
+		ConfigFile: defaultConfigFile,
+		DebugLevel: defaultLogLevel,
+		LogDir:     defaultLogDir,
+		RPCServer:  defaultRPCServer,
+		RPCCert:    defaultRPCCertFile,
+		ClKernel:   defaultClKernel,
 	}
 
 	// Create the home directory if it doesn't already exist.
@@ -313,6 +308,43 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
+	// Check the autocalibrations if the user is setting that.
+	if len(cfg.Autocalibrate) > 0 {
+		// Parse a list like -A 450,600
+		if strings.Contains(cfg.Autocalibrate, ",") {
+			specifiedAutocalibrates := strings.Split(cfg.Autocalibrate, ",")
+			cfg.AutocalibrateInts = make([]int, len(specifiedAutocalibrates))
+			for i := range specifiedAutocalibrates {
+				j, err := strconv.Atoi(specifiedAutocalibrates[i])
+				if err != nil {
+					err := fmt.Errorf("Could not convert autocalibration "+
+						"(%v) to int: %s", specifiedAutocalibrates[i],
+						err.Error())
+					fmt.Fprintln(os.Stderr, err)
+					return nil, nil, err
+				}
+
+				cfg.AutocalibrateInts[i] = j
+			}
+			// Use specified device like -A 600
+		} else {
+			cfg.AutocalibrateInts = make([]int, 1)
+			i, err := strconv.Atoi(cfg.Autocalibrate)
+			if err != nil {
+				err := fmt.Errorf("Could not convert autocalibration %v "+
+					"to int: %s", cfg.Autocalibrate, err.Error())
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+
+			cfg.AutocalibrateInts[0] = i
+		}
+		// Apply default
+	} else {
+		cfg.AutocalibrateInts = []int{defaultAutocalibrate}
+	}
+
+	// Check the devices if the user is setting that.
 	if len(cfg.Devices) > 0 {
 		// Parse a list like -D 1,2
 		if strings.Contains(cfg.Devices, ",") {
@@ -322,7 +354,8 @@ func loadConfig() (*config, []string, error) {
 				j, err := strconv.Atoi(specifiedDevices[i])
 				if err != nil {
 					err := fmt.Errorf("Could not convert device number %v "+
-						"(%v) to int: %s", i+1, specifiedDevices[i], err.Error())
+						"(%v) to int: %s", i+1, specifiedDevices[i],
+						err.Error())
 					fmt.Fprintln(os.Stderr, err)
 					return nil, nil, err
 				}
@@ -344,61 +377,101 @@ func loadConfig() (*config, []string, error) {
 		}
 	}
 
-	// Check the intensities if the user is setting that.
-	cfg.IntensityInts = make([]int, len(cfg.Intensity))
-	if !reflect.DeepEqual(cfg.Intensity, defaultIntensity) {
-		for i := range cfg.Intensity {
-			var err error
-			cfg.IntensityInts[i], err = strconv.Atoi(cfg.Intensity[i])
+	// Check the intensity if the user is setting that.
+	if len(cfg.Intensity) > 0 {
+		// Parse a list like -i 29,30
+		if strings.Contains(cfg.Intensity, ",") {
+			specifiedIntensities := strings.Split(cfg.Intensity, ",")
+			cfg.IntensityInts = make([]int, len(specifiedIntensities))
+			for i := range specifiedIntensities {
+				j, err := strconv.Atoi(specifiedIntensities[i])
+				if err != nil {
+					err := fmt.Errorf("Could not convert intensity "+
+						"(%v) to int: %s", specifiedIntensities[i],
+						err.Error())
+					fmt.Fprintln(os.Stderr, err)
+					return nil, nil, err
+				}
+
+				cfg.IntensityInts[i] = j
+			}
+			// Use specified intensity like -i 29
+		} else {
+			cfg.IntensityInts = make([]int, 1)
+			i, err := strconv.Atoi(cfg.Intensity)
 			if err != nil {
-				err := fmt.Errorf("Could not convert intensity number %v "+
-					"(%v) to int: %s", i, cfg.Intensity[i], err.Error())
+				err := fmt.Errorf("Could not convert intensity %v "+
+					"to int: %s", cfg.Intensity, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
 
-			if (cfg.IntensityInts[i] < minIntensity) ||
-				(cfg.IntensityInts[i] > maxIntensity) {
-				err := fmt.Errorf("Intensity %v (device %v) not within "+
-					"range %v to %v.", cfg.IntensityInts[i], i, minIntensity,
-					maxIntensity)
-				fmt.Fprintln(os.Stderr, err)
-				return nil, nil, err
-			}
+			cfg.IntensityInts[0] = i
 		}
 	}
 
-	// Check the work size.
-	cfg.WorkSizeInts = make([]int, len(cfg.WorkSize))
-	if !reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-		for i := range cfg.WorkSize {
-			var err error
-			cfg.WorkSizeInts[i], err = strconv.Atoi(cfg.WorkSize[i])
-			if err != nil {
-				err := fmt.Errorf("Could not convert work size number %v "+
-					"(%v) to int: %s", i, cfg.Intensity[i], err.Error())
-				fmt.Fprintln(os.Stderr, err)
-				return nil, nil, err
-			}
+	for i := range cfg.IntensityInts {
+		if (cfg.IntensityInts[i] < minIntensity) ||
+			(cfg.IntensityInts[i] > maxIntensity) {
+			err := fmt.Errorf("Intensity %v not within "+
+				"range %v to %v.", cfg.IntensityInts[i], minIntensity,
+				maxIntensity)
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
+		}
+	}
 
-			if cfg.WorkSizeInts[i] < 0 {
-				err := fmt.Errorf("Zero or negative WorkSize passed: %v",
-					cfg.WorkSizeInts[i])
-				fmt.Fprintln(os.Stderr, err)
-				return nil, nil, err
-			}
-			if cfg.WorkSizeInts[i] > maxWorkSize {
-				err := fmt.Errorf("Too big WorkSize passed: %v, max %v",
-					cfg.WorkSizeInts[i], maxWorkSize)
-				fmt.Fprintln(os.Stderr, err)
-				return nil, nil, err
+	// Check the work size if the user is setting that.
+	if len(cfg.WorkSize) > 0 {
+		// Parse a list like -W 536870912,1073741824
+		if strings.Contains(cfg.WorkSize, ",") {
+			specifiedWorkSizes := strings.Split(cfg.WorkSize, ",")
+			cfg.WorkSizeInts = make([]int, len(specifiedWorkSizes))
+			for i := range specifiedWorkSizes {
+				j, err := strconv.Atoi(specifiedWorkSizes[i])
+				if err != nil {
+					err := fmt.Errorf("Could not convert worksize "+
+						"(%v) to int: %s", specifiedWorkSizes[i],
+						err.Error())
+					fmt.Fprintln(os.Stderr, err)
+					return nil, nil, err
+				}
+
+				cfg.WorkSizeInts[i] = j
 			}
-			if cfg.WorkSizeInts[i]%256 != 0 {
-				err := fmt.Errorf("Work size %v not a multiple of 256",
-					cfg.WorkSizeInts[i])
+			// Use specified worksize like -W 1073741824
+		} else {
+			cfg.WorkSizeInts = make([]int, 1)
+			i, err := strconv.Atoi(cfg.WorkSize)
+			if err != nil {
+				err := fmt.Errorf("Could not convert worksize %v "+
+					"to int: %s", cfg.WorkSize, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
+
+			cfg.WorkSizeInts[0] = i
+		}
+	}
+
+	for i := range cfg.WorkSizeInts {
+		if cfg.WorkSizeInts[i] < 0 {
+			err := fmt.Errorf("Zero or negative WorkSize passed: %v",
+				cfg.WorkSizeInts[i])
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
+		}
+		if cfg.WorkSizeInts[i] > maxWorkSize {
+			err := fmt.Errorf("Too big WorkSize passed: %v, max %v",
+				cfg.WorkSizeInts[i], maxWorkSize)
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
+		}
+		if cfg.WorkSizeInts[i]%256 != 0 {
+			err := fmt.Errorf("Work size %v not a multiple of 256",
+				cfg.WorkSizeInts[i])
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
 		}
 	}
 
diff --git a/device.go b/device.go
index 6974c5e..33c2157 100644
--- a/device.go
+++ b/device.go
@@ -11,7 +11,6 @@ import (
 	"math"
 	"math/big"
 	"os"
-	"reflect"
 	"sync"
 	"time"
 	"unsafe"
@@ -162,7 +161,7 @@ func ListDevices() {
 	}
 }
 
-func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
+func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
 	workDone chan []byte) (*Device, error) {
 	d := &Device{
 		index:      index,
@@ -225,14 +224,14 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 			minrLog.Errorf("Could not obtain compilation error log: %v",
 				clError(status, "CLGetProgramBuildInfo"))
 		}
-		var program_log interface{}
+		var programLog interface{}
 		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
-			cl.CL_PROGRAM_BUILD_LOG, logSize, &program_log, nil)
+			cl.CL_PROGRAM_BUILD_LOG, logSize, &programLog, nil)
 		if status != cl.CL_SUCCESS {
 			minrLog.Errorf("Could not obtain compilation error log: %v",
 				clError(status, "CLGetProgramBuildInfo"))
 		}
-		minrLog.Errorf("%s\n", program_log)
+		minrLog.Errorf("%s\n", programLog)
 
 		return nil, err
 	}
@@ -248,31 +247,58 @@ func NewDevice(index int, platformID cl.CL_platform_id, deviceID cl.CL_device_id
 	// Autocalibrate the desired work size for the kernel, or use one of the
 	// values passed explicitly by the use.
 	// The intensity or worksize must be set by the user.
-	userSetWorkSize := true
-	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
-		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-		userSetWorkSize = false
+	userSetWorkSize := false
+	if len(cfg.IntensityInts) > 0 || len(cfg.WorkSizeInts) > 0 {
+		userSetWorkSize = true
 	}
 
 	var globalWorkSize uint32
 	if !userSetWorkSize {
-		idealWorkSize, err := d.calcWorkSizeForMilliseconds(cfg.Autocalibrate)
+		// Apply the first setting as a global setting
+		calibrateTime := cfg.AutocalibrateInts[0]
+
+		// Override with the per-device setting if it exists
+		for i := range cfg.AutocalibrateInts {
+			if i == order {
+				calibrateTime = cfg.AutocalibrateInts[i]
+			}
+		}
+
+		idealWorkSize, err := d.calcWorkSizeForMilliseconds(calibrateTime)
 		if err != nil {
 			return nil, err
 		}
 
 		minrLog.Debugf("Autocalibration successful, work size for %v"+
 			"ms per kernel execution on device %v determined to be %v",
-			cfg.Autocalibrate, d.index, idealWorkSize)
+			calibrateTime, d.index, idealWorkSize)
 
 		globalWorkSize = idealWorkSize
 	} else {
-		if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-			globalWorkSize = 1 << uint32(cfg.IntensityInts[d.index])
-		} else {
-			globalWorkSize = uint32(cfg.WorkSizeInts[d.index])
+		if len(cfg.IntensityInts) > 0 {
+			// Apply the first setting as a global setting
+			globalWorkSize = 1 << uint32(cfg.IntensityInts[0])
+
+			// Override with the per-device setting if it exists
+			for i := range cfg.IntensityInts {
+				if i == order {
+					globalWorkSize = 1 << uint32(cfg.IntensityInts[order])
+				}
+			}
+		}
+		if len(cfg.WorkSizeInts) > 0 {
+			// Apply the first setting as a global setting
+			globalWorkSize = uint32(cfg.WorkSizeInts[0])
+
+			// Override with the per-device setting if it exists
+			for i := range cfg.WorkSizeInts {
+				if i == order {
+					globalWorkSize = uint32(cfg.WorkSizeInts[order])
+				}
+			}
 		}
 	}
+
 	intensity := math.Log2(float64(globalWorkSize))
 	minrLog.Infof("GPU #%d: Work size set to %v ('intensity' %v)",
 		d.index, globalWorkSize, intensity)
diff --git a/miner.go b/miner.go
index 10c2c96..6c5b370 100644
--- a/miner.go
+++ b/miner.go
@@ -4,7 +4,6 @@ package main
 
 import (
 	"fmt"
-	"reflect"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -74,28 +73,6 @@ func NewMiner() (*Miner, error) {
 		deviceIDs = CLdeviceIDs
 	}
 
-	// Check the number of intensities/work sizes versus the number of devices.
-	userSetWorkSize := true
-	if reflect.DeepEqual(cfg.Intensity, defaultIntensity) &&
-		reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-		userSetWorkSize = false
-	}
-	if userSetWorkSize {
-		if reflect.DeepEqual(cfg.WorkSize, defaultWorkSize) {
-			if len(cfg.Intensity) != len(deviceIDs) {
-				return nil, fmt.Errorf("Intensities supplied, but number supplied "+
-					"did not match the number of GPUs (got %v, want %v)",
-					len(cfg.Intensity), len(deviceIDs))
-			}
-		} else {
-			if len(cfg.WorkSize) != len(deviceIDs) {
-				return nil, fmt.Errorf("WorkSize supplied, but number supplied "+
-					"did not match the number of GPUs (got %v, want %v)",
-					len(cfg.WorkSize), len(deviceIDs))
-			}
-		}
-	}
-
 	m.devices = make([]*Device, len(deviceIDs))
 	for i, deviceID := range deviceIDs {
 		// Use the real device order so i.e. -D 1 doesn't print GPU #0
@@ -107,7 +84,7 @@ func NewMiner() (*Miner, error) {
 		}
 
 		var err error
-		m.devices[i], err = NewDevice(realnum, platformID, deviceID, m.workDone)
+		m.devices[i], err = NewDevice(realnum, i, platformID, deviceID, m.workDone)
 		if err != nil {
 			return nil, err
 		}

From f2ad4777b8d2b5a9f621444ae5ae680e912cb95c Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Thu, 25 Aug 2016 10:42:17 -0500
Subject: [PATCH 040/150] some cleanups to appease go clean/go vet (#69)

---
 device.go | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/device.go b/device.go
index 33c2157..3d0b786 100644
--- a/device.go
+++ b/device.go
@@ -67,31 +67,31 @@ func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
 }
 
 func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
-	var program_buffer [1][]byte
-	var program_size [1]cl.CL_size_t
+	var programBuffer [1][]byte
+	var programSize [1]cl.CL_size_t
 
 	// Read each program file and place content into buffer array.
-	program_handle, err := os.Open(filename)
+	programHandle, err := os.Open(filename)
 	if err != nil {
 		return nil, nil, err
 	}
-	defer program_handle.Close()
+	defer programHandle.Close()
 
 	buf := bytes.NewBuffer(nil)
-	_, err = io.Copy(buf, program_handle)
+	_, err = io.Copy(buf, programHandle)
 	if err != nil {
 		return nil, nil, err
 	}
 	str := string(buf.Bytes())
-	program_final := []byte(str)
+	programFinal := []byte(str)
 
-	program_size[0] = cl.CL_size_t(len(program_final))
-	program_buffer[0] = make([]byte, program_size[0])
-	for i := range program_final {
-		program_buffer[0][i] = program_final[i]
+	programSize[0] = cl.CL_size_t(len(programFinal))
+	programBuffer[0] = make([]byte, programSize[0])
+	for i := range programFinal {
+		programBuffer[0][i] = programFinal[i]
 	}
 
-	return program_buffer[:], program_size[:], nil
+	return programBuffer[:], programSize[:], nil
 }
 
 type Device struct {
@@ -134,7 +134,7 @@ type Device struct {
 
 func clError(status cl.CL_int, f string) error {
 	if -status < 0 || int(-status) > len(cl.ERROR_CODES_STRINGS) {
-		return fmt.Errorf("%s returned unknown error!")
+		return fmt.Errorf("returned unknown error")
 	}
 
 	return fmt.Errorf("%s returned error %s (%d)", f,
@@ -541,10 +541,10 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 			"minimum target %032x", d.index, hash, d.work.Target.Bytes())
 		d.invalidShares++
 		return
-	} else {
-		d.allDiffOneShares++
 	}
 
+	d.allDiffOneShares++
+
 	if !cfg.Benchmark {
 		// Assess versus the pool or daemon target.
 		if hashNum.Cmp(d.work.Target) > 0 {

From 6a68c00ee649e0d589f7b35f8c0980d877ede6c2 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Fri, 26 Aug 2016 11:01:02 -0500
Subject: [PATCH 041/150] fix build on 32-bit platforms and properly error on
 too small worksizes (#70)

---
 config.go | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/config.go b/config.go
index a7a7cfa..4f6e753 100644
--- a/config.go
+++ b/config.go
@@ -33,10 +33,9 @@ var (
 	defaultLogDir        = filepath.Join(minerHomeDir, defaultLogDirname)
 	defaultAutocalibrate = 500
 
-	// Took these values from cgminer.
 	minIntensity = 8
 	maxIntensity = 31
-	maxWorkSize  = 0xFFFFFFFF
+	maxWorkSize  = uint32(0xFFFFFFFF - 255)
 )
 
 type config struct {
@@ -77,7 +76,7 @@ type config struct {
 	Intensity         string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device. Single global value or a comma separated list."`
 	IntensityInts     []int
 	WorkSize          string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity). Single global value or a comma separated list."`
-	WorkSizeInts      []int
+	WorkSizeInts      []uint32
 
 	// Pool related options
 	Pool         string `short:"o" long:"pool" description:"Pool to connect to (e.g.stratum+tcp://pool:port)"`
@@ -426,7 +425,7 @@ func loadConfig() (*config, []string, error) {
 		// Parse a list like -W 536870912,1073741824
 		if strings.Contains(cfg.WorkSize, ",") {
 			specifiedWorkSizes := strings.Split(cfg.WorkSize, ",")
-			cfg.WorkSizeInts = make([]int, len(specifiedWorkSizes))
+			cfg.WorkSizeInts = make([]uint32, len(specifiedWorkSizes))
 			for i := range specifiedWorkSizes {
 				j, err := strconv.Atoi(specifiedWorkSizes[i])
 				if err != nil {
@@ -437,11 +436,11 @@ func loadConfig() (*config, []string, error) {
 					return nil, nil, err
 				}
 
-				cfg.WorkSizeInts[i] = j
+				cfg.WorkSizeInts[i] = uint32(j)
 			}
 			// Use specified worksize like -W 1073741824
 		} else {
-			cfg.WorkSizeInts = make([]int, 1)
+			cfg.WorkSizeInts = make([]uint32, 1)
 			i, err := strconv.Atoi(cfg.WorkSize)
 			if err != nil {
 				err := fmt.Errorf("Could not convert worksize %v "+
@@ -450,13 +449,13 @@ func loadConfig() (*config, []string, error) {
 				return nil, nil, err
 			}
 
-			cfg.WorkSizeInts[0] = i
+			cfg.WorkSizeInts[0] = uint32(i)
 		}
 	}
 
 	for i := range cfg.WorkSizeInts {
-		if cfg.WorkSizeInts[i] < 0 {
-			err := fmt.Errorf("Zero or negative WorkSize passed: %v",
+		if cfg.WorkSizeInts[i] < 256 {
+			err := fmt.Errorf("Too small WorkSize passed: %v, min 256",
 				cfg.WorkSizeInts[i])
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err

From d45aa16d7d8d471e7367092b6d80e82b781ce5f4 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Mon, 29 Aug 2016 13:26:24 -0500
Subject: [PATCH 042/150] properly account for multiple OpenCL platforms (#71)

---
 calibrate.go |  6 ++---
 config.go    |  2 +-
 device.go    | 42 +++++++++++++++++---------------
 miner.go     | 67 ++++++++++++++++++++++++++--------------------------
 4 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/calibrate.go b/calibrate.go
index 6c410e6..184d319 100644
--- a/calibrate.go
+++ b/calibrate.go
@@ -16,7 +16,7 @@ func (d *Device) getKernelExecutionTime(globalWorksize uint32) (time.Duration,
 	error) {
 	d.work = work.Work{}
 
-	minrLog.Tracef("Started GPU #%d: %s for kernel execution time fetch",
+	minrLog.Tracef("Started DEV #%d: %s for kernel execution time fetch",
 		d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
 
@@ -85,14 +85,14 @@ func (d *Device) getKernelExecutionTime(globalWorksize uint32) (time.Duration,
 	}
 
 	elapsedTime := time.Since(currentTime)
-	minrLog.Tracef("GPU #%d: Kernel execution to read time for work "+
+	minrLog.Tracef("DEV #%d: Kernel execution to read time for work "+
 		"size calibration: %v", d.index, elapsedTime)
 
 	return elapsedTime, nil
 }
 
 // calcWorkSizeForMilliseconds calculates the correct worksize to achieve
-// a GPU execution cycle of the passed duration in milliseconds.
+// a device execution cycle of the passed duration in milliseconds.
 func (d *Device) calcWorkSizeForMilliseconds(ms int) (uint32, error) {
 	workSize := uint32(1 << 10)
 	timeToAchieve := time.Duration(ms) * time.Millisecond
diff --git a/config.go b/config.go
index 4f6e753..0fb6a01 100644
--- a/config.go
+++ b/config.go
@@ -69,7 +69,7 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
-	Autocalibrate     string `short:"A" long:"autocalibrate" description:"GPU kernel execution target time in milliseconds. Single global value or a comma separated list."`
+	Autocalibrate     string `short:"A" long:"autocalibrate" description:"Time target in milliseconds to spend executing hashes on the device during each iteration. Single global value or a comma separated list."`
 	AutocalibrateInts []int
 	Devices           string `short:"D" long:"devices" description:"Single device ID or a comma separated list of device IDs to use."`
 	DeviceIDs         []int
diff --git a/device.go b/device.go
index 3d0b786..feff12f 100644
--- a/device.go
+++ b/device.go
@@ -52,7 +52,7 @@ func getCLPlatforms() ([]cl.CL_platform_id, error) {
 // getCLDevices returns the list of devices for the given platform.
 func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
 	var numDevices cl.CL_uint
-	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_GPU, 0, nil,
+	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, 0, nil,
 		&numDevices)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLGetDeviceIDs")
@@ -141,7 +141,7 @@ func clError(status cl.CL_int, f string) error {
 		cl.ERROR_CODES_STRINGS[-status], status)
 }
 
-// ListDevices prints a list of GPUs present.
+// ListDevices prints a list of devices present.
 func ListDevices() {
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
@@ -149,15 +149,19 @@ func ListDevices() {
 		os.Exit(1)
 	}
 
-	platformID := platformIDs[0]
-	deviceIDs, err := getCLDevices(platformID)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Could not get CL devices for platform: %v\n", err)
-		os.Exit(1)
-	}
+	deviceListIndex := 0
+	for i := range platformIDs {
+		platformID := platformIDs[i]
+		deviceIDs, err := getCLDevices(platformID)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Could not get CL devices for platform: %v\n", err)
+			os.Exit(1)
+		}
 
-	for i, deviceID := range deviceIDs {
-		fmt.Printf("GPU #%d: %s\n", i, getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"))
+		for _, deviceID := range deviceIDs {
+			fmt.Printf("DEV #%d: %s\n", deviceListIndex, getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"))
+			deviceListIndex++
+		}
 	}
 }
 
@@ -300,7 +304,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	}
 
 	intensity := math.Log2(float64(globalWorkSize))
-	minrLog.Infof("GPU #%d: Work size set to %v ('intensity' %v)",
+	minrLog.Infof("DEV #%d: Work size set to %v ('intensity' %v)",
 		d.index, globalWorkSize, intensity)
 	d.workSize = globalWorkSize
 
@@ -406,11 +410,11 @@ func (d *Device) testFoundCandidate() {
 }
 
 func (d *Device) runDevice() error {
-	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
+	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
 
 	// Bump the extraNonce for the device it's running on
-	// when you begin mining. This ensures each GPU is doing
+	// when you begin mining. This ensures each device is doing
 	// different work. If the extraNonce has already been
 	// set for valid work, restore that.
 	d.extraNonce += uint32(d.index) << 24
@@ -502,7 +506,7 @@ func (d *Device) runDevice() error {
 		}
 
 		for i := uint32(0); i < outputData[0]; i++ {
-			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
+			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
 				"extraNonce %08x, workID %08x, timestamp %08x",
 				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
 				util.Uint32EndiannessSwap(d.currentWorkID),
@@ -516,7 +520,7 @@ func (d *Device) runDevice() error {
 		}
 
 		elapsedTime := time.Since(currentTime)
-		minrLog.Tracef("GPU #%d: Kernel execution to read time: %v", d.index,
+		minrLog.Tracef("DEV #%d: Kernel execution to read time: %v", d.index,
 			elapsedTime)
 	}
 }
@@ -537,7 +541,7 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	// work check are considered to be hardware errors.
 	hashNum := blockchain.ShaHashToBig(&hash)
 	if hashNum.Cmp(chainParams.PowLimit) > 0 {
-		minrLog.Errorf("GPU #%d: Hardware error found, hash %v above "+
+		minrLog.Errorf("DEV #%d: Hardware error found, hash %v above "+
 			"minimum target %032x", d.index, hash, d.work.Target.Bytes())
 		d.invalidShares++
 		return
@@ -548,10 +552,10 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	if !cfg.Benchmark {
 		// Assess versus the pool or daemon target.
 		if hashNum.Cmp(d.work.Target) > 0 {
-			minrLog.Debugf("GPU #%d: Hash %v bigger than target %032x (boo)",
+			minrLog.Debugf("DEV #%d: Hash %v bigger than target %032x (boo)",
 				d.index, hash, d.work.Target.Bytes())
 		} else {
-			minrLog.Infof("GPU #%d: Found hash with work below target! %v (yay)",
+			minrLog.Infof("DEV #%d: Found hash with work below target! %v (yay)",
 				d.index, hash)
 			d.validShares++
 			d.workDone <- data
@@ -604,7 +608,7 @@ func (d *Device) PrintStats() {
 		float64(d.allDiffOneShares)) /
 		float64(secondsElapsed)
 
-	minrLog.Infof("GPU #%d (%s) reporting average hash rate %v, %v/%v valid work",
+	minrLog.Infof("DEV #%d (%s) reporting average hash rate %v, %v/%v valid work",
 		d.index,
 		d.deviceName,
 		util.FormatHashRate(averageHashRate),
diff --git a/miner.go b/miner.go
index 6c5b370..f09e8e5 100644
--- a/miner.go
+++ b/miner.go
@@ -8,7 +8,6 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/decred/gominer/cl"
 	"github.com/decred/gominer/stratum"
 	"github.com/decred/gominer/work"
 )
@@ -34,6 +33,8 @@ func NewMiner() (*Miner, error) {
 		needsWorkRefresh: make(chan struct{}),
 	}
 
+	m.devices = make([]*Device, 0)
+
 	// If needed, start pool code.
 	if cfg.Pool != "" && !cfg.Benchmark {
 		s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword, cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version())
@@ -47,47 +48,45 @@ func NewMiner() (*Miner, error) {
 	if err != nil {
 		return nil, fmt.Errorf("Could not get CL platforms: %v", err)
 	}
-	platformID := platformIDs[0]
-	CLdeviceIDs, err := getCLDevices(platformID)
-	if err != nil {
-		return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
-	}
 
-	var deviceIDs []cl.CL_device_id
+	deviceListIndex := 0
+	deviceListEnabledCount := 0
+
+	for p := range platformIDs {
+		platformID := platformIDs[p]
+		CLdeviceIDs, err := getCLDevices(platformID)
+		if err != nil {
+			return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
+		}
+
+		for _, CLdeviceID := range CLdeviceIDs {
+			miningAllowed := false
 
-	// Enforce device restrictions if they exist
-	if len(cfg.DeviceIDs) > 0 {
-		for _, i := range cfg.DeviceIDs {
-			var found = false
-			for j, CLdeviceID := range CLdeviceIDs {
-				if i == j {
-					deviceIDs = append(deviceIDs, CLdeviceID)
-					found = true
+			// Enforce device restrictions if they exist
+			if len(cfg.DeviceIDs) > 0 {
+				for _, i := range cfg.DeviceIDs {
+					if deviceListIndex == i {
+						miningAllowed = true
+					}
 				}
+			} else {
+				miningAllowed = true
 			}
-			if !found {
-				return nil, fmt.Errorf("Unable to find GPU #%d", i)
-			}
-		}
-	} else {
-		deviceIDs = CLdeviceIDs
-	}
 
-	m.devices = make([]*Device, len(deviceIDs))
-	for i, deviceID := range deviceIDs {
-		// Use the real device order so i.e. -D 1 doesn't print GPU #0
-		realnum := i
-		for iCL, CLdeviceID := range CLdeviceIDs {
-			if CLdeviceID == deviceID {
-				realnum = iCL
+			if miningAllowed {
+				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
+				deviceListEnabledCount++
+				m.devices = append(m.devices, newDevice)
+				if err != nil {
+					return nil, err
+				}
 			}
+			deviceListIndex++
 		}
+	}
 
-		var err error
-		m.devices[i], err = NewDevice(realnum, i, platformID, deviceID, m.workDone)
-		if err != nil {
-			return nil, err
-		}
+	if deviceListEnabledCount == 0 {
+		return nil, fmt.Errorf("No devices started")
 	}
 
 	m.started = uint32(time.Now().Unix())

From ffc266fe131c507d767620b06a6b074b375600b0 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Wed, 31 Aug 2016 11:11:14 -0400
Subject: [PATCH 043/150] Cleanup atomic usage. (#74)

---
 miner.go           | 35 +++++++++++------------------------
 stratum/stratum.go | 38 +++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 45 deletions(-)

diff --git a/miner.go b/miner.go
index f09e8e5..66eaa57 100644
--- a/miner.go
+++ b/miner.go
@@ -13,17 +13,18 @@ import (
 )
 
 type Miner struct {
+	// The following variables must only be used atomically.
+	validShares   uint64
+	staleShares   uint64
+	invalidShares uint64
+
+	started          uint32
 	devices          []*Device
 	workDone         chan []byte
 	quit             chan struct{}
 	needsWorkRefresh chan struct{}
 	wg               sync.WaitGroup
 	pool             *stratum.Stratum
-
-	started       uint32
-	validShares   uint64
-	staleShares   uint64
-	invalidShares uint64
 }
 
 func NewMiner() (*Miner, error) {
@@ -106,23 +107,15 @@ func (m *Miner) workSubmitThread() {
 			if m.pool == nil {
 				accepted, err := GetWorkSubmit(data)
 				if err != nil {
-					inval := atomic.LoadUint64(&m.invalidShares)
-					inval++
-					atomic.StoreUint64(&m.invalidShares, inval)
-
+					atomic.AddUint64(&m.invalidShares, 1)
 					minrLog.Errorf("Error submitting work: %v", err)
 				} else {
 					if accepted {
-						val := atomic.LoadUint64(&m.validShares)
-						val++
-						atomic.StoreUint64(&m.validShares, val)
-
+						atomic.AddUint64(&m.validShares, 1)
 						minrLog.Debugf("Submitted work successfully: %v",
 							accepted)
 					} else {
-						inval := atomic.LoadUint64(&m.invalidShares)
-						inval++
-						atomic.StoreUint64(&m.invalidShares, inval)
+						atomic.AddUint64(&m.invalidShares, 1)
 					}
 
 					m.needsWorkRefresh <- struct{}{}
@@ -132,17 +125,11 @@ func (m *Miner) workSubmitThread() {
 				if err != nil {
 					switch err {
 					case stratum.ErrStratumStaleWork:
-						stale := atomic.LoadUint64(&m.staleShares)
-						stale++
-						atomic.StoreUint64(&m.staleShares, stale)
-
+						atomic.AddUint64(&m.staleShares, 1)
 						minrLog.Debugf("Share submitted to pool was stale")
 
 					default:
-						inval := atomic.LoadUint64(&m.invalidShares)
-						inval++
-						atomic.StoreUint64(&m.invalidShares, inval)
-
+						atomic.AddUint64(&m.invalidShares, 1)
 						minrLog.Errorf("Error submitting work to pool: %v", err)
 					}
 				} else {
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 3660a2c..bc9de91 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -41,22 +41,23 @@ var ErrStratumStaleWork = fmt.Errorf("Stale work, throwing away")
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
 type Stratum struct {
-	sync.Mutex
-	cfg           Config
-	Conn          net.Conn
-	Reader        *bufio.Reader
-	ID            uint64
-	authID        uint64
-	subID         uint64
-	submitID      uint64
-	Diff          float64
-	Target        *big.Int
-	Submitted     bool
-	PoolWork      NotifyWork
-	latestJobTime uint32
-
+	// The following variables must only be used atomically.
 	ValidShares   uint64
 	InvalidShares uint64
+	latestJobTime uint32
+
+	sync.Mutex
+	cfg       Config
+	Conn      net.Conn
+	Reader    *bufio.Reader
+	ID        uint64
+	authID    uint64
+	subID     uint64
+	submitID  uint64
+	Diff      float64
+	Target    *big.Int
+	Submitted bool
+	PoolWork  NotifyWork
 }
 
 // Config holdes the config options that may be used by a stratum pool.
@@ -309,15 +310,10 @@ func (s *Stratum) handleBasicReply(resp interface{}) {
 	}
 	if aResp.ID == s.submitID {
 		if aResp.Result {
-			val := atomic.LoadUint64(&s.ValidShares)
-			val++
-			atomic.StoreUint64(&s.ValidShares, val)
+			atomic.AddUint64(&s.ValidShares, 1)
 			log.Debug("Share accepted")
 		} else {
-			inval := atomic.LoadUint64(&s.InvalidShares)
-			inval++
-			atomic.StoreUint64(&s.InvalidShares, inval)
-
+			atomic.AddUint64(&s.InvalidShares, 1)
 			log.Error("Share rejected: ", aResp.Error.ErrStr)
 		}
 		s.Submitted = false

From 321c9a19cecc4650ad878d401382017c5301459c Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Wed, 31 Aug 2016 16:49:04 -0400
Subject: [PATCH 044/150] Remove erroneous waitgroup Done in Stop (#76)

---
 miner.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/miner.go b/miner.go
index 66eaa57..8cb0e0f 100644
--- a/miner.go
+++ b/miner.go
@@ -255,6 +255,5 @@ func (m *Miner) Stop() {
 	close(m.quit)
 	for _, d := range m.devices {
 		d.Stop()
-		m.wg.Done()
 	}
 }

From a2dec145590621b849c66e9445cb7713db99825a Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 1 Sep 2016 07:58:27 -0400
Subject: [PATCH 045/150] Bump for v0.4.0

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index 0418a27..7ee119d 100644
--- a/version.go
+++ b/version.go
@@ -31,7 +31,7 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 3
+	appMinor uint = 4
 	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet

From 1fe5f42ce89c121aebe2fb3835759d587d6305b4 Mon Sep 17 00:00:00 2001
From: Josh Rickmar <jrick@devio.us>
Date: Wed, 7 Sep 2016 17:54:30 -0400
Subject: [PATCH 046/150] Print leading zeros in target difficulty. (#79)

---
 device.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/device.go b/device.go
index feff12f..1a29f16 100644
--- a/device.go
+++ b/device.go
@@ -542,7 +542,7 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	hashNum := blockchain.ShaHashToBig(&hash)
 	if hashNum.Cmp(chainParams.PowLimit) > 0 {
 		minrLog.Errorf("DEV #%d: Hardware error found, hash %v above "+
-			"minimum target %032x", d.index, hash, d.work.Target.Bytes())
+			"minimum target %064x", d.index, hash, d.work.Target.Bytes())
 		d.invalidShares++
 		return
 	}

From 408e30891ee0c08cba3fbb44525ea4162e4a5d38 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 16 Aug 2016 09:45:19 -0400
Subject: [PATCH 047/150] Initial support for cuda mining.

Modelled partly after ccminer.

Contains work by jrick, jcv, and jolan.
---
 GNUmakefile     |   28 +
 README.md       |    9 +-
 cldevice.go     |  346 +++++++++
 compat.h        |   94 +++
 config.go       |   11 +-
 cuda_helper.h   |  685 ++++++++++++++++
 cudevice.go     |  250 ++++++
 decred.cu       |  480 ++++++++++++
 device.go       |  363 +--------
 glide.lock      |    6 +-
 miner.go        |   76 +-
 miner.h         |  739 ++++++++++++++++++
 sph/blake.c     | 1133 +++++++++++++++++++++++++++
 sph/sph_blake.h |  337 ++++++++
 sph/sph_types.h | 1976 +++++++++++++++++++++++++++++++++++++++++++++++
 15 files changed, 6180 insertions(+), 353 deletions(-)
 create mode 100644 GNUmakefile
 create mode 100644 cldevice.go
 create mode 100644 compat.h
 create mode 100644 cuda_helper.h
 create mode 100644 cudevice.go
 create mode 100644 decred.cu
 create mode 100644 miner.h
 create mode 100644 sph/blake.c
 create mode 100644 sph/sph_blake.h
 create mode 100644 sph/sph_types.h

diff --git a/GNUmakefile b/GNUmakefile
new file mode 100644
index 0000000..014403d
--- /dev/null
+++ b/GNUmakefile
@@ -0,0 +1,28 @@
+CC ?= gcc
+CXX ?= g++
+NVCC ?= nvcc
+AR ?= ar
+
+.DEFAULT_GOAL := build
+
+obj:
+	mkdir obj
+
+obj/blake.o: obj
+	$(CC) -c sph/blake.c -o obj/blake.o
+
+obj/decred.o: obj
+	$(NVCC) -I. -c decred.cu -o obj/decred.o
+
+obj/cuda.a: obj/blake.o obj/decred.o
+	$(AR) rvs obj/cuda.a obj/blake.o obj/decred.o
+
+build: obj/cuda.a
+	go build
+
+install: obj/cuda.a
+	go install
+
+clean:
+	rm -rf obj
+	go clean
diff --git a/README.md b/README.md
index 87d48b9..089d970 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,10 @@
 
 ## Installation
 
-You need to have OpenCL installed. To download and build gominer, run:
+You need to have the OpenCL and CUDA development libraries
+installed. You only need the runtime and drivers for the one you plan
+on running (CUDA for NVIDIA, OpenCL for anything). To download and
+build gominer, run:
 
 ```
 go get -u github.com/Masterminds/glide
@@ -15,10 +18,10 @@ go install $(glide nv)
 ```
 
 On Ubuntu 16.04 you can install the necessary OpenCL packages (for
-Intel Graphics cards) with
+Intel Graphics cards) and CUDA libraries with:
 
 ```
-sudo apt-get install beignet-dev
+sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
 Other graphics cards will need different libraries.  We have built
diff --git a/cldevice.go b/cldevice.go
new file mode 100644
index 0000000..7183c14
--- /dev/null
+++ b/cldevice.go
@@ -0,0 +1,346 @@
+// Copyright (c) 2016 The Decred developers.
+
+package main
+
+import (
+	"fmt"
+	"math"
+	"os"
+	"time"
+	"unsafe"
+
+	"github.com/decred/gominer/cl"
+	"github.com/decred/gominer/util"
+	"github.com/decred/gominer/work"
+)
+
+// getCLInfo returns the first CL platform (platforms[0]) and all its devices.
+func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
+	platforms, err := getCLPlatforms()
+	if err != nil {
+		var none cl.CL_platform_id
+		return none, nil, fmt.Errorf("Could not get CL platforms: %v", err)
+	}
+	devices, err := getCLDevices(platforms[0])
+	if err != nil {
+		return platforms[0], nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
+	}
+	return platforms[0], devices, nil
+}
+
+// getCLPlatforms returns the IDs of the available CL platforms. It queries
+// the platform count first and then fetches that many IDs.
+func getCLPlatforms() ([]cl.CL_platform_id, error) {
+	var count cl.CL_uint
+	if status := cl.CLGetPlatformIDs(0, nil, &count); status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetPlatformIDs")
+	}
+	ids := make([]cl.CL_platform_id, count)
+	if status := cl.CLGetPlatformIDs(count, ids, nil); status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetPlatformIDs")
+	}
+	return ids, nil
+}
+
+// getCLDevices returns the IDs of all devices, of any type, that belong to
+// the given platform. The device count is queried first and then that many
+// IDs are fetched; a failure of either query is returned through clError.
+func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
+	var count cl.CL_uint
+	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, 0, nil, &count)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetDeviceIDs")
+	}
+	ids := make([]cl.CL_device_id, count)
+	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, count, ids, nil)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetDeviceIDs")
+	}
+	return ids, nil
+}
+
+// ListDevices prints a numbered "DEV #n: name" line for every device of every
+// CL platform; it exits with status 1 if the platforms or devices can't be read.
+func ListDevices() {
+	platforms, err := getCLPlatforms()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Could not get CL platforms: %v\n", err)
+		os.Exit(1)
+	}
+
+	next := 0
+	for _, platform := range platforms {
+		devices, err := getCLDevices(platform)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Could not get CL devices for platform: %v\n", err)
+			os.Exit(1)
+		}
+		for _, device := range devices {
+			name := getDeviceInfo(device, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME")
+			fmt.Printf("DEV #%d: %s\n", next, name)
+			next++
+		}
+	}
+}
+
+// NewDevice sets up deviceID for mining: it creates the CL context, command
+// queue and output buffer, builds the source loaded through cfg.ClKernel and
+// creates its "search" kernel, then chooses the work size. order selects this
+// device's entry in the per-device autocalibrate, intensity and work size
+// settings; the first entry of each list is the default. When the user set
+// neither intensity nor work size, the work size is autocalibrated.
+func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
+	workDone chan []byte) (*Device, error) {
+	d := &Device{
+		index:      index,
+		platformID: platformID,
+		deviceID:   deviceID,
+		deviceName: getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
+		quit:       make(chan struct{}),
+		newWork:    make(chan *work.Work, 5),
+		workDone:   workDone,
+	}
+
+	var status cl.CL_int
+
+	// Create the CL context.
+	d.context = cl.CLCreateContext(nil, 1, []cl.CL_device_id{deviceID},
+		nil, nil, &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateContext")
+	}
+
+	// Create the command queue.
+	d.queue = cl.CLCreateCommandQueue(d.context, deviceID, 0, &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateCommandQueue")
+	}
+
+	// Create the output buffer.
+	d.outputBuffer = cl.CLCreateBuffer(d.context, cl.CL_MEM_READ_WRITE,
+		uint32Size*outputBufferSize, nil, &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateBuffer")
+	}
+
+	// Load kernel source.
+	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
+	if err != nil {
+		return nil, fmt.Errorf("Could not load kernel source: %v", err)
+	}
+
+	// Create the program.
+	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc[:],
+		progSize[:], &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateProgramWithSource")
+	}
+
+	// Build the program for the device.
+	compilerOptions := ""
+	compilerOptions += fmt.Sprintf(" -D WORKSIZE=%d", localWorksize)
+	status = cl.CLBuildProgram(d.program, 1, []cl.CL_device_id{deviceID},
+		[]byte(compilerOptions), nil, nil)
+	if status != cl.CL_SUCCESS {
+		err = clError(status, "CLBuildProgram")
+
+		// Something went wrong! Print what it is.
+		var logSize cl.CL_size_t
+		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
+			cl.CL_PROGRAM_BUILD_LOG, 0, nil, &logSize)
+		if status != cl.CL_SUCCESS {
+			minrLog.Errorf("Could not obtain compilation error log: %v",
+				clError(status, "CLGetProgramBuildInfo"))
+		}
+		var programLog interface{}
+		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
+			cl.CL_PROGRAM_BUILD_LOG, logSize, &programLog, nil)
+		if status != cl.CL_SUCCESS {
+			minrLog.Errorf("Could not obtain compilation error log: %v",
+				clError(status, "CLGetProgramBuildInfo"))
+		}
+		minrLog.Errorf("%s\n", programLog)
+
+		return nil, err
+	}
+
+	// Create the kernel.
+	d.kernel = cl.CLCreateKernel(d.program, []byte("search"), &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateKernel")
+	}
+
+	d.started = uint32(time.Now().Unix())
+
+	// Autocalibrate the desired work size for the kernel, or use one of the
+	// values passed explicitly by the user.
+	// Autocalibration only runs when neither intensity nor worksize is set.
+	userSetWorkSize := false
+	if len(cfg.IntensityInts) > 0 || len(cfg.WorkSizeInts) > 0 {
+		userSetWorkSize = true
+	}
+
+	var globalWorkSize uint32
+	if !userSetWorkSize {
+		// Apply the first setting as a global setting
+		calibrateTime := cfg.AutocalibrateInts[0]
+
+		// Override with the per-device setting if it exists
+		if order >= 0 && order < len(cfg.AutocalibrateInts) {
+			calibrateTime = cfg.AutocalibrateInts[order]
+		}
+
+		idealWorkSize, err := d.calcWorkSizeForMilliseconds(calibrateTime)
+		if err != nil {
+			return nil, err
+		}
+
+		minrLog.Debugf("Autocalibration successful, work size for %v"+
+			"ms per kernel execution on device %v determined to be %v",
+			calibrateTime, d.index, idealWorkSize)
+
+		globalWorkSize = idealWorkSize
+	} else {
+		if len(cfg.IntensityInts) > 0 {
+			// Apply the first setting as a global setting
+			globalWorkSize = 1 << uint32(cfg.IntensityInts[0])
+
+			// Override with the per-device setting if it exists
+			if order >= 0 && order < len(cfg.IntensityInts) {
+				globalWorkSize = 1 << uint32(cfg.IntensityInts[order])
+			}
+		}
+		if len(cfg.WorkSizeInts) > 0 {
+			// Apply the first setting as a global setting
+			globalWorkSize = uint32(cfg.WorkSizeInts[0])
+
+			// Override with the per-device setting if it exists
+			if order >= 0 && order < len(cfg.WorkSizeInts) {
+				globalWorkSize = uint32(cfg.WorkSizeInts[order])
+			}
+
+		}
+	}
+	intensity := math.Log2(float64(globalWorkSize))
+	minrLog.Infof("DEV #%d: Work size set to %v ('intensity' %v)",
+		d.index, globalWorkSize, intensity)
+	d.workSize = globalWorkSize
+
+	return d, nil
+}
+
+// runDevice is the device's mining loop: each pass refreshes the work, runs
+// the kernel and hands candidates to foundCandidate until d.quit or a CL error.
+func (d *Device) runDevice() error {
+	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
+	outputData := make([]uint32, outputBufferSize)
+
+	// Bump the extraNonce for the device it's running on
+	// when you begin mining. This ensures each device is doing
+	// different work. If the extraNonce has already been
+	// set for valid work, restore that.
+	d.extraNonce += uint32(d.index) << 24
+	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+
+	var status cl.CL_int
+	for {
+		d.updateCurrentWork()
+
+		select {
+		case <-d.quit:
+			return nil
+		default:
+		}
+
+		// Increment extraNonce.
+		util.RolloverExtraNonce(&d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+
+		// Update the timestamp. Only solo work allows you to roll
+		// the timestamp.
+		ts := d.work.JobTime
+		if d.work.IsGetWork {
+			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+			ts = d.work.JobTime + diffSeconds
+		}
+		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
+
+		// arg 0: pointer to the buffer
+		obuf := d.outputBuffer
+		status = cl.CLSetKernelArg(d.kernel, 0,
+			cl.CL_size_t(unsafe.Sizeof(obuf)),
+			unsafe.Pointer(&obuf))
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLSetKernelArg")
+		}
+
+		// args 1..8: midstate
+		for i := 0; i < 8; i++ {
+			ms := d.midstate[i]
+			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1),
+				uint32Size, unsafe.Pointer(&ms))
+			if status != cl.CL_SUCCESS {
+				return clError(status, "CLSetKernelArg")
+			}
+		}
+
+		// args 9..20: lastBlock except nonce
+		i2 := 0
+		for i := 0; i < 12; i++ {
+			if i2 == work.Nonce0Word {
+				i2++
+			}
+			lb := d.lastBlock[i2]
+			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9),
+				uint32Size, unsafe.Pointer(&lb))
+			if status != cl.CL_SUCCESS {
+				return clError(status, "CLSetKernelArg")
+			}
+			i2++
+		}
+
+		// Clear the found count from the buffer
+		status = cl.CLEnqueueWriteBuffer(d.queue, d.outputBuffer,
+			cl.CL_FALSE, 0, uint32Size, unsafe.Pointer(&zeroSlice[0]),
+			0, nil, nil)
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLEnqueueWriteBuffer")
+		}
+
+		// Execute the kernel and follow its execution time.
+		currentTime := time.Now()
+		globalWorkSize := [1]cl.CL_size_t{cl.CL_size_t(d.workSize)}
+		localWorkSize := [1]cl.CL_size_t{localWorksize}
+		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil,
+			globalWorkSize[:], localWorkSize[:], 0, nil, nil)
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLEnqueueNDRangeKernel")
+		}
+
+		// Read the output buffer; keep its status so a failed read is caught.
+		status = cl.CLEnqueueReadBuffer(d.queue, d.outputBuffer, cl.CL_TRUE, 0,
+			uint32Size*outputBufferSize, unsafe.Pointer(&outputData[0]), 0,
+			nil, nil)
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLEnqueueReadBuffer")
+		}
+
+		for i := uint32(0); i < outputData[0]; i++ {
+			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
+				"extraNonce %08x, workID %08x, timestamp %08x",
+				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
+				util.Uint32EndiannessSwap(d.currentWorkID),
+				d.lastBlock[work.TimestampWord])
+
+			// Assess the work. If it's below target, it'll be rejected
+			// here. The mining algorithm currently sends this function any
+			// difficulty 1 shares.
+			d.foundCandidate(d.lastBlock[work.TimestampWord], outputData[i+1],
+				d.lastBlock[work.Nonce1Word])
+		}
+
+		elapsedTime := time.Since(currentTime)
+		minrLog.Tracef("DEV #%d: Kernel execution to read time: %v", d.index,
+			elapsedTime)
+	}
+}
diff --git a/compat.h b/compat.h
new file mode 100644
index 0000000..a98dab9
--- /dev/null
+++ b/compat.h
@@ -0,0 +1,94 @@
+#ifndef __COMPAT_H__
+#define __COMPAT_H__
+
+#ifdef WIN32
+
+#include <windows.h>
+#include <time.h>
+
+#define localtime_r(src, dst) localtime_s(dst, src)
+
+static __inline void sleep(int secs)
+{
+	Sleep(secs * 1000);
+}
+
+enum {
+	PRIO_PROCESS = 0,
+};
+
+extern int opt_priority;
+
+static __inline int setpriority(int which, int who, int prio)
+{
+	switch (opt_priority) {
+		case 5:
+			prio = THREAD_PRIORITY_TIME_CRITICAL;
+			break;
+		case 4:
+			prio = THREAD_PRIORITY_HIGHEST;
+			break;
+		case 3:
+			prio = THREAD_PRIORITY_ABOVE_NORMAL;
+			break;
+		case 2:
+			prio = THREAD_PRIORITY_NORMAL;
+			break;
+		case 1:
+			prio = THREAD_PRIORITY_BELOW_NORMAL;
+			break;
+		case 0:
+		default:
+			prio = THREAD_PRIORITY_IDLE;
+	}
+	return -!SetThreadPriority(GetCurrentThread(), prio);
+}
+
+#ifdef _MSC_VER
+#define snprintf(...) _snprintf(__VA_ARGS__)
+#define strdup(...) _strdup(__VA_ARGS__)
+#define strncasecmp(x,y,z) _strnicmp(x,y,z)
+#define strcasecmp(x,y) _stricmp(x,y)
+typedef int ssize_t;
+
+__inline int msver(void) {
+	switch (_MSC_VER) {
+	case 1500: return 2008;
+	case 1600: return 2010;
+	case 1700: return 2012;
+	case 1800: return 2013;
+	case 1900: return 2015;
+	default: return (_MSC_VER/100);
+	}
+}
+
+#include <stdlib.h>
+static __inline char * dirname(char *file) {
+	char buffer[_MAX_PATH] = { 0 };
+	char drive[_MAX_DRIVE];
+	char dir[_MAX_DIR];
+	char fname[_MAX_FNAME];
+	char ext[_MAX_EXT];
+	_splitpath_s(file, drive, _MAX_DRIVE, dir, _MAX_DIR, fname, _MAX_FNAME, ext, _MAX_EXT);
+	sprintf(buffer, "%s%s", drive, dir);
+	return strdup(buffer);
+}
+#endif
+
+#endif /* WIN32 */
+
+#ifdef _MSC_VER
+# define __func__ __FUNCTION__
+# define __thread __declspec(thread)
+# define _ALIGN(x) __declspec(align(x))
+#else
+# define _ALIGN(x) __attribute__ ((aligned(x)))
+/* dirname() for linux/mingw */
+#include <libgen.h>
+#endif
+
+#ifndef WIN32
+#define MAX_PATH PATH_MAX
+#endif
+
+#endif /* __COMPAT_H__ */
diff --git a/config.go b/config.go
index 0fb6a01..e417a70 100644
--- a/config.go
+++ b/config.go
@@ -39,8 +39,9 @@ var (
 )
 
 type config struct {
-	ListDevices bool `short:"l" long:"listdevices" description:"List number of devices."`
-	ShowVersion bool `short:"V" long:"version" description:"Display version information and exit"`
+	ListDevices   bool `short:"l" long:"listdevices" description:"List number of devices."`
+	ListCuDevices bool `long:"listcudadevices" description:"List number of CUDA devices."`
+	ShowVersion   bool `short:"V" long:"version" description:"Display version information and exit"`
 
 	// Config / log options
 	ConfigFile string `short:"C" long:"configfile" description:"Path to configuration file"`
@@ -69,6 +70,7 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
+	UseCuda           bool   `short:"U" long:"cuda" description:"Use CUDA if GPU supports it"`
 	Autocalibrate     string `short:"A" long:"autocalibrate" description:"Time target in milliseconds to spend executing hashes on the device during each iteration. Single global value or a comma separated list."`
 	AutocalibrateInts []int
 	Devices           string `short:"D" long:"devices" description:"Single device ID or a comma separated list of device IDs to use."`
@@ -264,6 +266,11 @@ func loadConfig() (*config, []string, error) {
 		os.Exit(0)
 	}
 
+	if preCfg.ListCuDevices {
+		ListCuDevices()
+		os.Exit(0)
+	}
+
 	if preCfg.ShowVersion {
 		fmt.Println(appName, "version", version())
 		os.Exit(0)
diff --git a/cuda_helper.h b/cuda_helper.h
new file mode 100644
index 0000000..1358892
--- /dev/null
+++ b/cuda_helper.h
@@ -0,0 +1,685 @@
+#ifndef CUDA_HELPER_H
+#define CUDA_HELPER_H
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#ifdef __INTELLISENSE__
+/* reduce vstudio warnings (__byteperm, blockIdx...) */
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#define __launch_bounds__(max_tpb, min_blocks)
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifndef UINT32_MAX
+/* slackware need that */
+#define UINT32_MAX UINT_MAX
+#endif
+
+#ifndef MAX_GPUS
+#define MAX_GPUS 16
+#endif
+
+extern "C" short device_map[MAX_GPUS];
+extern "C"  long device_sm[MAX_GPUS];
+
+extern int cuda_arch[MAX_GPUS];
+
+// common functions
+extern int cuda_get_arch(int thr_id);
+extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
+extern void cuda_check_cpu_free(int thr_id);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
+extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
+extern __device__ __device_builtin__ void __syncthreads(void);
+extern __device__ __device_builtin__ void __threadfence(void);
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+#ifndef SPH_C32
+#define SPH_C32(x) (x)
+// #define SPH_C32(x) ((uint32_t)(x ## U))
+#endif
+
+#ifndef SPH_C64
+#define SPH_C64(x) (x)
+// #define SPH_C64(x) ((uint64_t)(x ## ULL))
+#endif
+
+#ifndef SPH_T32
+#define SPH_T32(x) (x)
+// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+#endif
+
+#ifndef SPH_T64
+#define SPH_T64(x) (x)
+// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+#endif
+
+#if __CUDA_ARCH__ < 320
+// Host and Compute 3.0
+#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#define __ldg(x) (*(x))
+#else
+// Compute 3.2+
+#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
+#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
+#endif
+
+__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+{
+#if __CUDA_ARCH__ >= 130
+	return __double_as_longlong(__hiloint2double(HI, LO));
+#else
+	return (uint64_t)LO | (((uint64_t)HI) << 32);
+#endif
+}
+
+// replace the high word of a 64-bit value
+__device__ __forceinline__ uint64_t REPLACE_HIDWORD(const uint64_t &x, const uint32_t &y) {
+	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U);
+}
+
+// replace the low word of a 64-bit value
+__device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uint32_t &y) {
+	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+}
+
+// endian swap for 32-bit values
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
+{
+	/* device */
+	return __byte_perm(x, x, 0x0123);
+}
+#else
+	/* host */
+	#define cuda_swab32(x) \
+	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+		(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#endif
+
+// extract the low word of a 64-bit value
+__device__ __forceinline__ uint32_t _LODWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+	return (uint32_t)__double2loint(__longlong_as_double(x));
+#else
+	return (uint32_t)(x & 0xFFFFFFFFULL);
+#endif
+}
+
+// extract the high word of a 64-bit value
+__device__ __forceinline__ uint32_t _HIDWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+	return (uint32_t)__double2hiint(__longlong_as_double(x));
+#else
+	return (uint32_t)(x >> 32);
+#endif
+}
+
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
+{
+	// Input:       77665544 33221100
+	// Output:      00112233 44556677
+	uint64_t result;
+	//result = __byte_perm((uint32_t) x, 0, 0x0123);
+	//return (result << 32) + __byte_perm(_HIDWORD(x), 0, 0x0123);
+	asm("{ .reg .b32 x, y; // swab64\n\t"
+		"mov.b64 {x,y}, %1;\n\t"
+		"prmt.b32 x, x, 0, 0x0123;\n\t"
+		"prmt.b32 y, y, 0, 0x0123;\n\t"
+		"mov.b64 %0, {y,x};\n\t"
+	"}\n" : "=l"(result): "l"(x));
+	return result;
+}
+#else
+	/* host */
+	#define cuda_swab64(x) \
+		((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
+			(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
+			(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
+			(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
+			(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
+			(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
+			(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
+			(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+#endif
+
+// swap two uint32_t without extra registers
+__device__ __host__ __forceinline__ void xchg(uint32_t &x, uint32_t &y) {
+	x ^= y; y = x ^ y; x ^= y;
+}
+// for other types...
+#define XCHG(x, y) { x ^= y; y = x ^ y; x ^= y; }
+
+/*********************************************************************/
+// Macros to catch CUDA errors in CUDA runtime calls
+
+#define CUDA_SAFE_CALL(call)                                          \
+do {                                                                  \
+	cudaError_t err = call;                                           \
+	if (cudaSuccess != err) {                                         \
+		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
+		         __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
+		exit(EXIT_FAILURE);                                           \
+	}                                                                 \
+} while (0)
+
+#define CUDA_CALL_OR_RET(call) do {                                   \
+	cudaError_t err = call;                                           \
+	if (cudaSuccess != err) {                                         \
+		cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
+		return;                                                       \
+	}                                                                 \
+} while (0)
+
+#define CUDA_CALL_OR_RET_X(call, ret) do {                            \
+	cudaError_t err = call;                                           \
+	if (cudaSuccess != err) {                                         \
+		cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
+		return ret;                                                   \
+	}                                                                 \
+} while (0)
+
+/*********************************************************************/
+#if !defined(__CUDA_ARCH__) || defined(_WIN64)
+#define USE_XOR_ASM_OPTS 0
+#else
+#define USE_XOR_ASM_OPTS 1
+#endif
+
+#if USE_XOR_ASM_OPTS
+// device asm for whirpool
+__device__ __forceinline__
+uint64_t xor1(uint64_t a, uint64_t b)
+{
+	uint64_t result;
+	asm("xor.b64 %0, %1, %2; // xor1" : "=l"(result) : "l"(a), "l"(b));
+	return result;
+}
+#else
+#define xor1(a,b) (a ^ b)
+#endif
+
+#if USE_XOR_ASM_OPTS
+// device asm for whirpool
+__device__ __forceinline__
+uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
+{
+	uint64_t result;
+	asm("xor.b64 %0, %2, %3; // xor3\n\t"
+	    "xor.b64 %0, %0, %1;\n\t"
+		/* output : input registers */
+		: "=l"(result) : "l"(a), "l"(b), "l"(c));
+	return result;
+}
+#else
+#define xor3(a,b,c) (a ^ b ^ c)
+#endif
+
+#if USE_XOR_ASM_OPTS
+// device asm for whirpool
+__device__ __forceinline__
+uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
+{
+	uint64_t result;
+	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
+	return result;
+}
+#else
+#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h))
+#endif
+
+// device asm for x17
+__device__ __forceinline__
+uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
+{
+#ifdef __CUDA_ARCH__
+	uint64_t result;
+	asm("{ // xandx \n\t"
+		".reg .u64 n;\n\t"
+		"xor.b64 %0, %2, %3;\n\t"
+		"and.b64 n, %0, %1;\n\t"
+		"xor.b64 %0, n, %3;\n\t"
+	"}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c));
+	return result;
+#else
+	return ((b^c) & a) ^ c;
+#endif
+}
+
+// device asm for x17
+__device__ __forceinline__
+uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
+{
+#ifdef __CUDA_ARCH__
+	uint64_t result;
+	asm("{ // andor\n\t"
+		".reg .u64 m,n;\n\t"
+		"and.b64 m,  %1, %2;\n\t"
+		" or.b64 n,  %1, %2;\n\t"
+		"and.b64 %0, n,  %3;\n\t"
+		" or.b64 %0, %0, m;\n\t"
+	"}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c));
+	return result;
+#else
+	return ((a | b) & c) | (a & b);
+#endif
+}
+
+// device asm for x17
+__device__ __forceinline__
+uint64_t shr_t64(uint64_t x, uint32_t n)
+{
+#ifdef __CUDA_ARCH__
+	uint64_t result;
+	asm("shr.b64 %0,%1,%2;\n\t"
+	: "=l"(result) : "l"(x), "r"(n));
+	return result;
+#else
+	return x >> n;
+#endif
+}
+
+__device__ __forceinline__
+uint64_t shl_t64(uint64_t x, uint32_t n)
+{
+#ifdef __CUDA_ARCH__
+	uint64_t result;
+	asm("shl.b64 %0,%1,%2;\n\t"
+	: "=l"(result) : "l"(x), "r"(n));
+	return result;
+#else
+	return x << n;
+#endif
+}
+
+__device__ __forceinline__
+uint32_t shr_t32(uint32_t x,uint32_t n) {
+#ifdef __CUDA_ARCH__
+	uint32_t result;
+	asm("shr.b32 %0,%1,%2;"	: "=r"(result) : "r"(x), "r"(n));
+	return result;
+#else
+	return x >> n;
+#endif
+}
+
+__device__ __forceinline__
+uint32_t shl_t32(uint32_t x,uint32_t n) {
+#ifdef __CUDA_ARCH__
+	uint32_t result;
+	asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
+	return result;
+#else
+	return x << n;
+#endif
+}
+
+#ifndef USE_ROT_ASM_OPT
+#define USE_ROT_ASM_OPT 1
+#endif
+
+// 64-bit ROTATE RIGHT
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
+/* complicated sm >= 3.5 one (accelerated by the funnel shifter), to bench */
+__device__ __forceinline__
+uint64_t ROTR64(const uint64_t value, const int offset) {
+	uint2 result;
+	if(offset < 32) {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} else {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
+__device__ __forceinline__
+uint64_t ROTR64(const uint64_t x, const int offset)
+{
+	uint64_t result;
+	asm("{ // ROTR64 \n\t"
+		".reg .b64 lhs;\n\t"
+		".reg .u32 roff;\n\t"
+		"shr.b64 lhs, %1, %2;\n\t"
+		"sub.u32 roff, 64, %2;\n\t"
+		"shl.b64 %0, %1, roff;\n\t"
+		"add.u64 %0, %0, lhs;\n\t"
+	"}\n" : "=l"(result) : "l"(x), "r"(offset));
+	return result;
+}
+#else
+/* host */
+#define ROTR64(x, n)  (((x) >> (n)) | ((x) << (64 - (n))))
+#endif
+
+// 64-bit ROTATE LEFT
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
+__device__ __forceinline__
+uint64_t ROTL64(const uint64_t value, const int offset) {
+	uint2 result;
+	if(offset >= 32) {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} else {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return  __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
+__device__ __forceinline__
+uint64_t ROTL64(const uint64_t x, const int offset)
+{
+	uint64_t result;
+	asm("{ // ROTL64 \n\t"
+		".reg .b64 lhs;\n\t"
+		".reg .u32 roff;\n\t"
+		"shl.b64 lhs, %1, %2;\n\t"
+		"sub.u32 roff, 64, %2;\n\t"
+		"shr.b64 %0, %1, roff;\n\t"
+		"add.u64 %0, lhs, %0;\n\t"
+	"}\n" : "=l"(result) : "l"(x), "r"(offset));
+	return result;
+}
+#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
+__device__
+uint64_t ROTL64(const uint64_t x, const int offset)
+{
+	uint64_t res;
+	asm("{ // ROTL64 \n\t"
+		".reg .u32 tl,th,vl,vh;\n\t"
+		".reg .pred p;\n\t"
+		"mov.b64 {tl,th}, %1;\n\t"
+		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+		"setp.lt.u32 p, %2, 32;\n\t"
+		"@!p mov.b64 %0, {vl,vh};\n\t"
+		"@p  mov.b64 %0, {vh,vl};\n\t"
+	"}\n" : "=l"(res) : "l"(x) , "r"(offset)
+	);
+	return res;
+}
+#else
+/* host */
+#define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
+#endif
+
+__device__ __forceinline__
+uint64_t SWAPDWORDS(uint64_t value)
+{
+#if __CUDA_ARCH__ >= 320
+	uint2 temp;
+	asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
+	asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
+	return value;
+#else
+	return ROTL64(value, 32);
+#endif
+}
+
+/* lyra2/bmw - uint2 vector's operators */
+
+__device__ __forceinline__
+void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
+#ifdef __CUDA_ARCH__
+	asm("mov.b64 {%0,%1},%2; \n\t"
+		: "=r"(lo), "=r"(hi) : "l"(x));
+#else
+	lo = (uint32_t)(x);
+	hi = (uint32_t)(x >> 32);
+#endif
+}
+
+static __host__ __device__ __forceinline__ uint2 vectorize(uint64_t v) {
+	uint2 result;
+#ifdef __CUDA_ARCH__
+	asm("mov.b64 {%0,%1},%2; \n\t"
+		: "=r"(result.x), "=r"(result.y) : "l"(v));
+#else
+	result.x = (uint32_t)(v);
+	result.y = (uint32_t)(v >> 32);
+#endif
+	return result;
+}
+
+static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) {
+#ifdef __CUDA_ARCH__
+	return MAKE_ULONGLONG(v.x, v.y);
+#else
+	return (((uint64_t)v.y) << 32) + v.x;
+#endif
+}
+
+/**
+ * uint2 direct ops by c++ operator definitions
+ */
+static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
+static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
+static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
+static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
+static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
+
+static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
+#ifdef __CUDA_ARCH__
+	uint2 result;
+	asm("{ // uint2 a+b \n\t"
+		"add.cc.u32 %0, %2, %4; \n\t"
+		"addc.u32   %1, %3, %5; \n\t"
+	"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+	return result;
+#else
+	return vectorize(devectorize(a) + devectorize(b));
+#endif
+}
+static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
+
+
+static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) {
+#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
+	uint2 result;
+	asm("{ // uint2 a-b \n\t"
+		"sub.cc.u32 %0, %2, %4; \n\t"
+		"subc.u32   %1, %3, %5; \n\t"
+	"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+	return result;
+#else
+	return vectorize(devectorize(a) - devectorize(b));
+#endif
+}
+static __device__ __forceinline__ void operator-= (uint2 &a, uint2 b) { a = a - b; }
+
+/**
+ * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b))
+ * (what does uint64 "*" operator)
+ */
+static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
+{
+#ifdef __CUDA_ARCH__
+	uint2 result;
+	asm("{ // uint2 a*b \n\t"
+		"mul.lo.u32       %0, %2, %4;  \n\t"
+		"mul.hi.u32       %1, %2, %4;  \n\t"
+		"mad.lo.cc.u32    %1, %3, %4, %1; \n\t"
+		"madc.lo.u32      %1, %3, %5, %1; \n\t"
+	"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+	return result;
+#else
+	// incorrect but unused host equiv
+	return make_uint2(a.x * b.x, a.y * b.y);
+#endif
+}
+
+// uint2 ROR/ROL methods
+__device__ __forceinline__
+uint2 ROR2(const uint2 a, const int offset)
+{
+	uint2 result;
+#if __CUDA_ARCH__ > 300
+	if (offset < 32) {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+	} else /* if (offset < 64) */ {
+		/* offset SHOULD BE < 64 ! */
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+#else
+	if (!offset)
+		result = a;
+	else if (offset < 32) {
+		result.y = ((a.y >> offset) | (a.x << (32 - offset)));
+		result.x = ((a.x >> offset) | (a.y << (32 - offset)));
+	} else if (offset == 32) {
+		result.y = a.x;
+		result.x = a.y;
+	} else {
+		result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
+		result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
+	}
+#endif
+	return result;
+}
+
+__device__ __forceinline__
+uint2 ROL2(const uint2 a, const int offset)
+{
+	uint2 result;
+#if __CUDA_ARCH__ > 300
+	if (offset >= 32) {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+	}
+	else {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+#else
+	if (!offset)
+		result = a;
+	else
+		result = ROR2(a, 64 - offset);
+#endif
+	return result;
+}
+
+__device__ __forceinline__
+uint2 SWAPUINT2(uint2 value)
+{
+	return make_uint2(value.y, value.x);
+}
+
+/* Byte aligned Rotations (lyra2) */
+#ifdef __CUDA_ARCH__
+__device__ __inline__ uint2 ROL8(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x6543);
+	result.y = __byte_perm(a.y, a.x, 0x2107);
+	return result;
+}
+
+__device__ __inline__ uint2 ROR16(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x1076);
+	result.y = __byte_perm(a.y, a.x, 0x5432);
+	return result;
+}
+
+__device__ __inline__ uint2 ROR24(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x2107);
+	result.y = __byte_perm(a.y, a.x, 0x6543);
+	return result;
+}
+#else
+#define ROL8(u)  ROL2(u, 8)
+#define ROR16(u) ROR2(u,16)
+#define ROR24(u) ROR2(u,24)
+#endif
+
+/* uint2 for bmw512 - to double check later */
+
+__device__ __forceinline__
+static uint2 SHL2(uint2 a, int offset)
+{
+#if __CUDA_ARCH__ > 300
+	uint2 result;
+	if (offset < 32)  {
+		asm("{ // SHL2 (l) \n\t"
+			"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
+			"shl.b32         %0, %2, %4;     \n\t"
+		"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	} else {
+		asm("{ // SHL2 (h) \n\t"
+			"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
+			"shl.b32         %0, %2, %4;     \n\t"
+		"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+	}
+	return result;
+#else
+	if (offset <= 32) {
+		a.y = (a.y << offset) | (a.x >> (32 - offset));
+		a.x = (a.x << offset);
+	} else {
+		a.y = (a.x << (offset-32));
+		a.x = 0;
+	}
+	return a;
+#endif
+}
+
+__device__ __forceinline__
+static uint2 SHR2(uint2 a, int offset)
+{
+#if __CUDA_ARCH__ > 300
+	uint2 result;
+	if (offset<32) {
+		asm("{\n\t"
+			"shf.r.clamp.b32 %0,%2,%3,%4; \n\t"
+			"shr.b32 %1,%3,%4; \n\t"
+			"}\n\t"
+			: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	} else {
+		asm("{\n\t"
+			"shf.l.clamp.b32 %0,%2,%3,%4; \n\t"
+			"shl.b32 %1,%3,%4; \n\t"
+			"}\n\t"
+			: "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+	}
+	return result;
+#else
+	if (offset <= 32) {
+		a.x = (a.x >> offset) | (a.y << (32 - offset));
+		a.y = (a.y >> offset);
+	} else {
+		a.x = (a.y >> (offset - 32));
+		a.y = 0;
+	}
+	return a;
+#endif
+}
+
+#endif // #ifndef CUDA_HELPER_H
diff --git a/cudevice.go b/cudevice.go
new file mode 100644
index 0000000..6b8826d
--- /dev/null
+++ b/cudevice.go
@@ -0,0 +1,250 @@
+// Copyright (c) 2016 The Decred developers.
+
+package main
+
+/*
+#cgo LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/cuda.a
+#include <stdint.h>
+void decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads, uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh);
+void decred_cpu_setBlock_52(const uint32_t *input);
+*/
+import "C"
+import (
+	"encoding/binary"
+	"fmt"
+	"reflect"
+	"runtime"
+	"time"
+	"unsafe"
+
+	"github.com/mumax/3/cuda/cu"
+
+	"github.com/decred/gominer/util"
+	"github.com/decred/gominer/work"
+)
+
+const (
+	// From ccminer
+	threadsPerBlock = 640
+	blockx          = threadsPerBlock
+)
+
+// decredCPUSetBlock52 passes the prepared 192-byte header buffer to the C
+// helper decred_cpu_setBlock_52, which reads it as 32-bit words. It panics
+// when input is nil.
+func decredCPUSetBlock52(input *[192]byte) {
+	if input == nil {
+		panic("input is nil")
+	}
+	words := (*C.uint32_t)(unsafe.Pointer(input))
+	C.decred_cpu_setBlock_52(words)
+}
+
+// decredHashNonce launches the CUDA hashing kernel through the C wrapper
+// decred_hash_nonce. gridx and blockx are the launch dimensions, threads and
+// startNonce are forwarded to the kernel, nonceResults is the device buffer
+// that receives candidate nonces and targetHigh is the comparison target.
+func decredHashNonce(gridx, blockx, threads uint32, startNonce uint32, nonceResults cu.DevicePtr, targetHigh uint32) {
+	// nonceResults is a device address; it is only reinterpreted here so it
+	// can be handed to C.
+	results := (*C.uint32_t)(unsafe.Pointer(nonceResults))
+	C.decred_hash_nonce(
+		C.uint32_t(gridx), C.uint32_t(blockx), C.uint32_t(threads),
+		C.uint32_t(startNonce), results, C.uint32_t(targetHigh),
+	)
+}
+
+// getCUInfo initializes the CUDA driver, logs the number of GPUs and the name
+// of each one, and returns them in index order. The returned error is always
+// nil.
+func getCUInfo() ([]cu.Device, error) {
+	cu.Init(0)
+	count := cu.DeviceGetCount()
+	minrLog.Infof("%v GPUs", count)
+	var found []cu.Device
+	// XXX Do this more like ListCuDevices
+	for idx := 0; idx < count; idx++ {
+		found = append(found, cu.DeviceGet(idx))
+		minrLog.Infof("%v: %v", idx, found[idx].Name())
+	}
+	return found, nil
+}
+
+// getCUDevices returns the list of devices for the given platform.
+func getCUDevices() ([]cu.Device, error) {
+	cu.Init(0)
+
+	version := cu.Version()
+	fmt.Println(version)
+
+	maj := version / 1000
+	min := version % 100
+
+	minMajor := 5
+	minMinor := 5
+
+	if maj < minMajor || (maj == minMajor && min < minMinor) {
+		return nil, fmt.Errorf("Driver does not suppoer CUDA %v.%v API", minMajor, minMinor)
+	}
+
+	var numDevices int
+	numDevices = cu.DeviceGetCount()
+	if numDevices < 1 {
+		return nil, fmt.Errorf("No devices found")
+	}
+	devices := make([]cu.Device, numDevices)
+	for i := 0; i < numDevices; i++ {
+		dev := cu.DeviceGet(i)
+		devices[i] = dev
+	}
+	return devices, nil
+}
+
+// ListCuDevices prints a list of CUDA capable GPUs present. When the devices
+// cannot be enumerated it prints a single "No CUDA Capable GPUs present"
+// line instead.
+func ListCuDevices() {
+	// CUDA devices
+	// github.com/mumax/3/cuda/cu panics instead of returning errors, so a
+	// panic while probing is reported here rather than crashing the caller.
+	defer func() {
+		if r := recover(); r != nil {
+			fmt.Println("No CUDA Capable GPUs present")
+		}
+	}()
+	devices, err := getCUDevices()
+	if err != nil {
+		// Report the reason instead of silently printing nothing.
+		fmt.Printf("No CUDA Capable GPUs present: %v\n", err)
+		return
+	}
+	for i, dev := range devices {
+		fmt.Printf("CUDA Capable GPU #%d: %s\n", i, dev.Name())
+	}
+}
+
+// NewCuDevice builds the Device bookkeeping for the CUDA GPU deviceID. No
+// CUDA context is created here; runCuDevice does that on its own thread. The
+// order argument is currently unused and the returned error is always nil.
+func NewCuDevice(index int, order int, deviceID cu.Device,
+	workDone chan []byte) (*Device, error) {
+
+	dev := &Device{
+		index:      index,
+		cuDeviceID: deviceID,
+		deviceName: deviceID.Name(),
+		cuda:       true,
+		quit:       make(chan struct{}),
+		newWork:    make(chan *work.Work, 5),
+		workDone:   workDone,
+		// Number of uint32 slots in the nonce result buffers that
+		// runCuDevice allocates.
+		cuInSize: 21,
+		started:  uint32(time.Now().Unix()),
+	}
+
+	// Autocalibrate?
+
+	return dev, nil
+}
+
+// runCuDevice is the mining loop for a CUDA device. It creates the CUDA
+// context on a locked OS thread and then, until d.quit is closed, repeatedly
+// uploads the current block header, launches the hashing kernel and passes
+// every candidate nonce the GPU reports to foundCandidate. It returns nil
+// once d.quit is closed.
+func (d *Device) runCuDevice() error {
+	// Bump the extraNonce for the device it's running on
+	// when you begin mining. This ensures each GPU is doing
+	// different work. If the extraNonce has already been
+	// set for valid work, restore that.
+	d.extraNonce += uint32(d.index) << 24
+	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+
+	// Need to have this stuff here for a ctx vs thread issue.
+	runtime.LockOSThread()
+
+	// Create the CU context and make it current on this thread.
+	d.cuContext = cu.CtxCreate(cu.CTX_BLOCKING_SYNC, d.cuDeviceID)
+	d.cuContext.SetCurrent()
+
+	// The kernel is built with nvcc rather than through an API call, so
+	// most of the setup is done at compile time.
+
+	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
+	nonceResultsH := cu.MemAllocHost(d.cuInSize * 4)
+	nonceResultsD := cu.MemAlloc(d.cuInSize * 4)
+	defer cu.MemFreeHost(nonceResultsH)
+	defer nonceResultsD.Free()
+
+	// View the host buffer as a []uint32 without copying. Slot 0 is the
+	// result counter, the remaining slots hold candidate nonces.
+	nonceResultsHSliceHeader := reflect.SliceHeader{
+		Data: uintptr(nonceResultsH),
+		Len:  int(d.cuInSize),
+		Cap:  int(d.cuInSize),
+	}
+	nonceResultsHSlice := *(*[]uint32)(unsafe.Pointer(&nonceResultsHSliceHeader))
+
+	endianData := new([192]byte)
+
+	for {
+		d.updateCurrentWork()
+
+		select {
+		case <-d.quit:
+			return nil
+		default:
+		}
+
+		// Increment extraNonce.
+		util.RolloverExtraNonce(&d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+
+		// Update the timestamp. Only solo work allows you to roll
+		// the timestamp. This has to happen before the header is
+		// uploaded so the GPU hashes the same timestamp that is later
+		// handed to foundCandidate.
+		ts := d.work.JobTime
+		if d.work.IsGetWork {
+			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+			ts = d.work.JobTime + diffSeconds
+		}
+		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
+
+		// Assemble the 180-byte header: the first 128 bytes come straight
+		// from the work data, the rest from lastBlock as big-endian words.
+		copy(endianData[:], d.work.Data[:128])
+		for i, j := 128, 0; i < 180; i, j = i+4, j+1 {
+			binary.BigEndian.PutUint32(endianData[i:], d.lastBlock[j])
+		}
+		decredCPUSetBlock52(endianData)
+
+		// Reset the result counter on the device.
+		nonceResultsHSlice[0] = 0
+
+		cu.MemcpyHtoD(nonceResultsD, nonceResultsH, d.cuInSize*4)
+
+		// Execute the kernel and follow its execution time.
+		currentTime := time.Now()
+
+		// TODO Which nonceword is this?  In ccminer it is &pdata[35]
+		startNonce := d.lastBlock[work.Nonce1Word]
+		//fmt.Printf("%p %v\n", &startNonce, startNonce)
+
+		throughput := uint32(0x20000000) // TODO
+		//throughput = minUint32(throughput, ^uint32(0)-nonce)
+		//gridx := int((throughput + threadsPerBlock - 1) / threadsPerBlock)
+		//gridx := (int(throughput) + 639) / 640
+		//gridx := ((throughput - 1) / 640)
+		gridx := uint32(52428) // don't ask me why this works.
+
+		targetHigh := ^uint32(0) // TODO
+
+		decredHashNonce(gridx, blockx, throughput, startNonce, nonceResultsD, targetHigh)
+
+		cu.MemcpyDtoH(nonceResultsH, nonceResultsD, d.cuInSize*4)
+
+		// The kernel increments the counter without an upper bound, so
+		// clamp it to the number of slots that actually exist before
+		// slicing the host buffer.
+		numResults := nonceResultsHSlice[0]
+		if maxResults := uint32(len(nonceResultsHSlice) - 1); numResults > maxResults {
+			numResults = maxResults
+		}
+		for i, result := range nonceResultsHSlice[1 : 1+numResults] {
+			// lol seelog
+			i := i
+			result := result
+			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
+				"extraNonce %08x, workID %08x, timestamp %08x",
+				d.index, i, result, d.lastBlock[work.Nonce1Word],
+				util.Uint32EndiannessSwap(d.currentWorkID),
+				d.lastBlock[work.TimestampWord])
+
+			// Assess the work. If it's below target, it'll be rejected
+			// here. The mining algorithm currently sends this function any
+			// difficulty 1 shares.
+			d.foundCandidate(d.lastBlock[work.TimestampWord], result,
+				d.lastBlock[work.Nonce1Word])
+		}
+
+		elapsedTime := time.Since(currentTime)
+		minrLog.Tracef("GPU #%d: Kernel execution to read time: %v", d.index,
+			elapsedTime)
+	}
+}
+
+// minUint32 returns the smaller of a and b.
+func minUint32(a, b uint32) uint32 {
+	if a < b {
+		return a
+	}
+	return b
+}
diff --git a/decred.cu b/decred.cu
new file mode 100644
index 0000000..c5cd420
--- /dev/null
+++ b/decred.cu
@@ -0,0 +1,480 @@
+/**
+ * Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2/6.1)
+ *
+ * Tanguy Pruvot - Feb 2016
+ *
+ * Merged 8-round blake (XVC) tweaks
+ * Further improved by: ~2.72%
+ * Alexis Provos - Jun 2016
+ */
+
+// nvcc  -I. -c decred.cu --ptx
+
+#include <stdint.h>
+#include <memory.h>
+#include <miner.h>
+
+extern "C" {
+#include <sph/sph_blake.h>
+}
+
+/* threads per block */
+#define TPB 640
+
+/* max count of found nonces in one call (like sgminer) */
+#define maxResults 4
+
+/*
+ * decred_hash hashes the first 180 bytes of input on the CPU with the sph
+ * BLAKE-256 implementation and writes the digest to output.
+ *
+ * The round count is set to 14 through sph_blake256_set_rounds before the
+ * context is initialised. output must have room for the whole digest
+ * (presumably 32 bytes for BLAKE-256 - confirm against sph_blake.h).
+ */
+extern "C" void decred_hash(void *output, const void *input)
+{
+	sph_blake256_context ctx;
+
+	sph_blake256_set_rounds(14);
+
+	sph_blake256_init(&ctx);
+	sph_blake256(&ctx, input, 180);
+	sph_blake256_close(&ctx, output);
+}
+
+#include <cuda_helper.h>
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, b) x
+#define atomicInc(p, max) (*p)++
+#endif
+
+/*
+ * Per-job constants filled in by decred_cpu_setBlock_52:
+ *   c_h    - the two words compared against the state in the kernel's
+ *            final checks
+ *   c_data - the precomputed 16-word starting state (only the first 64
+ *            bytes are written by the host)
+ *   c_xors - the 215 precomputed (message ^ constant) words, stored in the
+ *            order the kernel consumes them
+ */
+__constant__ uint32_t _ALIGN(16) c_h[2];
+__constant__ uint32_t _ALIGN(16) c_data[32];
+__constant__ uint32_t _ALIGN(16) c_xors[215];
+
+/* Buffers of candidate nonce(s) */
+static uint32_t *d_resNonce[MAX_GPUS];
+static uint32_t *h_resNonce[MAX_GPUS];
+
+/* 32-bit rotates expressed as byte permutations: right by 8 and by 16. */
+#define ROR8(a)  __byte_perm(a, 0, 0x0321)
+#define ROL16(a) __byte_perm(a, 0, 0x1032)
+
+/* macro bodies */
+
+/*
+ * All macros below operate on the kernel's local state array v[] and on the
+ * running index i into c_xors; every c_xors[i++] consumes the next
+ * precomputed (message ^ constant) word. The rotation amounts are 16, 12, 8
+ * and 7, i.e. the quarter-round used by this BLAKE-256 kernel.
+ *
+ * pxorGS: one quarter-round on v[a], v[b], v[c], v[d]; consumes 2 words.
+ */
+#define pxorGS(a,b,c,d) { \
+	v[a]+= c_xors[i++] + v[b]; \
+	v[d] = ROL16(v[d] ^ v[a]); \
+	v[c]+= v[d]; \
+	v[b] = ROTR32(v[b] ^ v[c], 12); \
+	v[a]+= c_xors[i++] + v[b]; \
+	v[d] = ROR8(v[d] ^ v[a]); \
+	v[c]+= v[d]; \
+	v[b] = ROTR32(v[b] ^ v[c], 7); \
+}
+
+/*
+ * pxorGS2: two quarter-rounds interleaved, (a,b,c,d) and (a1,b1,c1,d1);
+ * consumes 4 words in the order first/a, first/a1, second/a, second/a1.
+ */
+#define pxorGS2(a,b,c,d, a1,b1,c1,d1) {\
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
+}
+
+/*
+ * The four variants below are pxorGS2 with the thread's nonce xored into
+ * exactly one of the four consumed words.
+ *
+ * pxory1GS2: nonce goes into the 4th word (second addition of the a1 group).
+ */
+#define pxory1GS2(a,b,c,d, a1,b1,c1,d1) { \
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= (c_xors[i++]^nonce) + v[b1]; \
+	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
+}
+
+/* pxory0GS2: nonce goes into the 3rd word (second addition of the a group). */
+#define pxory0GS2(a,b,c,d, a1,b1,c1,d1) { \
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
+	v[ a]+= (c_xors[i++]^nonce) + v[ b];    v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
+}
+
+/* pxorx1GS2: nonce goes into the 2nd word (first addition of the a1 group). */
+#define pxorx1GS2(a,b,c,d, a1,b1,c1,d1) { \
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= (c_xors[i++]^nonce) + v[b1]; \
+	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
+	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
+}
+
+/* pxorx0GS2: nonce goes into the 1st word (first addition of the a group). */
+#define pxorx0GS2(a,b,c,d, a1,b1,c1,d1) { \
+	v[ a]+= (c_xors[i++]^nonce) + v[ b];    v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROL16(v[ d] ^ v[ a]); 	        v[d1] = ROL16(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
+	v[ a]+= c_xors[i++] + v[ b]; 			v[a1]+= c_xors[i++] + v[b1]; \
+	v[ d] = ROR8(v[ d] ^ v[ a]); 	        v[d1] = ROR8(v[d1] ^ v[a1]); \
+	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
+	v[ b] = ROTR32(v[ b] ^ v[ c], 7); 		v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
+}
+
+extern "C"
+{
+
+/*
+ * decred_gpu_hash_nonce tests one nonce per thread.
+ *
+ * threads    - number of threads that do work; threads with a larger global
+ *              index return without doing anything
+ * startNonce - nonce tested by global thread 0; thread t tests startNonce + t
+ * resNonce   - device buffer; resNonce[0] is a counter that is atomically
+ *              incremented for each hit, and the hit's nonce is stored at
+ *              the incremented index
+ * highTarget - upper bound for the byte-swapped word tested in the last check
+ *
+ * The starting state comes from c_data and the message words from c_xors,
+ * both prepared on the host by decred_cpu_setBlock_52.
+ *
+ * NOTE(review): the index written in resNonce is not checked against the
+ * size of the buffer (atomicInc wraps only at UINT32_MAX) - confirm that the
+ * caller's buffer is large enough for the expected number of hits.
+ */
+//__global__ __launch_bounds__(TPB,1)
+__global__ void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t highTarget)
+{
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if (thread < threads)
+	{
+		/* load the 16-word precomputed state, four words at a time */
+		uint32_t v[16];
+		#pragma unroll
+		for(int i=0; i<16; i+=4) {
+			*(uint4*)&v[i] = *(uint4*)&c_data[i];
+		}
+
+		/*
+		 * Finish the step the host left open because it depends on the
+		 * nonce (the host adds only data[5] to data[1] at that point).
+		 */
+		const uint32_t nonce = startNonce + thread;
+		v[ 1]+= (nonce ^ 0x13198A2E);
+		v[13] = ROR8(v[13] ^ v[1]);
+		v[ 9]+= v[13];
+		v[ 5] = ROTR32(v[5] ^ v[9], 7);
+
+		/* i walks c_xors; every macro below advances it */
+		int i = 0;
+		v[ 1]+= c_xors[i++];// + v[ 6];
+		v[ 0]+= v[5];
+		v[12] = ROL16(v[12] ^ v[ 1]);         v[15] = ROL16(v[15] ^ v[ 0]);
+		v[11]+= v[12];                        v[10]+= v[15];
+		v[ 6] = ROTR32(v[ 6] ^ v[11], 12);    v[ 5] = ROTR32(v[5] ^ v[10], 12);
+		v[ 1]+= c_xors[i++] + v[ 6];          v[ 0]+= c_xors[i++] + v[ 5];
+		v[12] = ROR8(v[12] ^ v[ 1]);          v[15] = ROR8(v[15] ^ v[ 0]);
+		v[11]+= v[12];                        v[10]+= v[15];
+		v[ 6] = ROTR32(v[ 6] ^ v[11], 7);     v[ 5] = ROTR32(v[ 5] ^ v[10], 7);
+
+		/* remaining steps; the variant macros mark where the nonce enters */
+		pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxory1GS2( 2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorx1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorx1GS2( 2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxory1GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxory1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorx1GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxory0GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorx0GS2( 2, 7, 8, 13, 3, 4, 9, 14);
+		pxory1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxory1GS2( 2, 7, 8, 13, 3, 4, 9, 14);
+		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorx1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
+		pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS(    2, 7, 8, 13);
+
+		/*
+		 * Cheap first test, equivalent to (c_h[1] ^ v[7] ^ v[15]) == 0.
+		 * Only threads that pass it finish the last half quarter-round
+		 * needed for v[14] and run the target comparison.
+		 */
+		if ((c_h[1]^v[15]) == v[7]) {
+			v[ 3] += c_xors[i++] + v[4];
+			v[14] = ROL16(v[14] ^ v[3]);
+			v[ 9] += v[14];
+			v[ 4] = ROTR32(v[4] ^ v[9], 12);
+			v[ 3] += c_xors[i++] + v[4];
+			v[14] = ROR8(v[14] ^ v[3]);
+			if(cuda_swab32((c_h[0]^v[6]^v[14])) <= highTarget) {
+				/* atomicInc returns the old counter, so pos is the new count */
+				uint32_t pos = atomicInc(&resNonce[0], UINT32_MAX)+1;
+				resNonce[pos] = nonce;
+				return;
+			}
+		}
+	}
+}
+}
+
+extern "C" {
+/*
+ * decred_hash_nonce is the C entry point used from Go: it launches
+ * decred_gpu_hash_nonce with a grid x block launch configuration and
+ * forwards the remaining arguments unchanged. resNonce must be a device
+ * pointer. The launch is not followed by any error check or explicit
+ * synchronisation here.
+ */
+void decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads, uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh)
+{
+	decred_gpu_hash_nonce <<<grid, block>>> (threads, startNonce, resNonce, targetHigh);
+}
+}
+
+extern "C" {
+/*
+ * decred_cpu_setBlock_52 prepares the per-job constants for the kernel.
+ *
+ * input points at the block header as 32-bit words; words 0..44 are read.
+ * The function
+ *   1. runs the sph BLAKE-256 implementation over the first 128 bytes and
+ *      takes ctx.H as the starting chaining value,
+ *   2. builds the message words m[] of the last block from input[32..44]
+ *      (byte-swapped), leaving m[3] - the nonce slot, word 35 of the
+ *      header - at 0,
+ *   3. uploads h[] to c_h, the partially advanced state data[] to c_data
+ *      and the (m ^ z) table preXOR[] to c_xors.
+ *
+ * Everything that does not depend on the nonce is computed here once, so
+ * the kernel only has to do the nonce dependent part per thread.
+ */
+__host__
+void decred_cpu_setBlock_52(const uint32_t *input)
+{
+	/*
+	for (int i = 0; i < 180/4; i++)
+		printf("%08x", input[i]);
+	printf("\n");
+	*/
+/*
+	Precompute everything possible and pass it on constant memory
+*/
+	/* constant table; the kernel's literal 0x13198A2E is z[2] */
+	const uint32_t z[16] = {
+		0x243F6A88U, 0x85A308D3U, 0x13198A2EU, 0x03707344U,
+		0xA4093822U, 0x299F31D0U, 0x082EFA98U, 0xEC4E6C89U,
+		0x452821E6U, 0x38D01377U, 0xBE5466CFU, 0x34E90C6CU,
+		0xC0AC29B7U, 0xC97C50DDU, 0x3F84D5B5U, 0xB5470917U
+	};
+
+	int i=0;
+	uint32_t _ALIGN(64) preXOR[215];
+	uint32_t _ALIGN(64)   data[16];
+	uint32_t _ALIGN(64)      m[16];
+	uint32_t _ALIGN(64)      h[ 2];
+
+	/* midstate over the first 128 bytes of the header */
+	sph_blake256_context ctx;
+	sph_blake256_set_rounds(14);
+	sph_blake256_init(&ctx);
+	sph_blake256(&ctx, input, 128);
+
+	data[ 0] = ctx.H[0];
+	data[ 1] = ctx.H[1];
+	data[ 2] = ctx.H[2];
+	data[ 3] = ctx.H[3];
+	data[ 4] = ctx.H[4];
+	data[ 5] = ctx.H[5];
+	data[ 8] = ctx.H[6];
+
+	/* NOTE(review): data[12] is assigned again below before it is ever read */
+	data[12] = swab32(input[35]);
+	data[13] = ctx.H[7];
+
+	// pre swab32
+	/* m[3] stays 0: it is the nonce slot, filled in per thread by the kernel */
+	m[ 0] = swab32(input[32]);	m[ 1] = swab32(input[33]);
+	m[ 2] = swab32(input[34]);	m[ 3] = 0;
+	m[ 4] = swab32(input[36]);	m[ 5] = swab32(input[37]);
+	m[ 6] = swab32(input[38]);	m[ 7] = swab32(input[39]);
+	m[ 8] = swab32(input[40]);	m[ 9] = swab32(input[41]);
+	m[10] = swab32(input[42]);	m[11] = swab32(input[43]);
+	m[12] = swab32(input[44]);	m[13] = 0x80000001;
+	m[14] = 0;
+	m[15] = 0x000005a0; /* 0x5a0 = 1440 = 180 * 8, the message length in bits */
+
+	/* keep ctx.H[6] and ctx.H[7] for the kernel's final comparisons */
+	h[ 0] = data[ 8];
+	h[ 1] = data[13];
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_h,h, 8, 0, cudaMemcpyHostToDevice));
+
+	/* first steps on (0,4,8,12); nothing here depends on the nonce */
+	data[ 0]+= (m[ 0] ^ z[1]) + data[ 4];
+	data[12]  = SPH_ROTR32(z[4] ^ SPH_C32(0x5A0) ^ data[ 0], 16);
+
+	data[ 8] = z[0]+data[12];
+	data[ 4] = SPH_ROTR32(data[ 4] ^ data[ 8], 12);
+	data[ 0]+= (m[ 1] ^ z[0]) + data[ 4];
+	data[12] = SPH_ROTR32(data[12] ^ data[ 0],8);
+	data[ 8]+= data[12];
+	data[ 4] = SPH_ROTR32(data[ 4] ^ data[ 8], 7);
+
+	/* (1,5,9,13): stops where the nonce word would be added; the kernel finishes it */
+	data[ 1]+= (m[ 2] ^ z[3]) + data[ 5];
+	data[13] = SPH_ROTR32((z[5] ^ SPH_C32(0x5A0)) ^ data[ 1], 16);
+	data[ 9] = z[1]+data[13];
+	data[ 5] = SPH_ROTR32(data[ 5] ^ data[ 9], 12);
+	data[ 1]+= data[ 5]; //+nonce ^ ...
+
+	/* (2,6,10,14): uses h[0] where data[6] has not been set yet */
+	data[ 2]+= (m[ 4] ^ z[5]) + h[ 0];
+	data[14] = SPH_ROTR32(z[6] ^ data[ 2],16);
+	data[10] = z[2] + data[14];
+	data[ 6] = SPH_ROTR32(h[ 0] ^ data[10], 12);
+	data[ 2]+= (m[ 5] ^ z[4]) + data[ 6];
+	data[14] = SPH_ROTR32(data[14] ^ data[ 2], 8);
+	data[10]+= data[14];
+	data[ 6] = SPH_ROTR32(data[ 6] ^ data[10], 7);
+
+	/* (3,7,11,15): uses h[1] where data[7] has not been set yet */
+	data[ 3]+= (m[ 6] ^ z[7]) + h[ 1];
+	data[15] = SPH_ROTR32(z[7] ^ data[ 3],16);
+	data[11] = z[3] + data[15];
+	data[ 7] = SPH_ROTR32(h[ 1] ^ data[11], 12);
+	data[ 3]+= (m[ 7] ^ z[6]) + data[ 7];
+	data[15] = SPH_ROTR32(data[15] ^ data[ 3],8);
+	data[11]+= data[15];
+	data[ 7] = SPH_ROTR32(data[11] ^ data[ 7], 7);
+	data[ 0]+= m[ 8] ^ z[9];
+
+	/* 64 bytes = the 16 state words */
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, 64, 0, cudaMemcpyHostToDevice));
+
+/* precalcXORGS appends the two words one quarter-round consumes */
+#define precalcXORGS(x,y) { \
+	preXOR[i++]= (m[x] ^ z[y]); \
+	preXOR[i++]= (m[y] ^ z[x]); \
+}
+/* precalcXORGS2 appends the four words of two interleaved quarter-rounds */
+#define precalcXORGS2(x,y,x1,y1){\
+	preXOR[i++] = (m[ x] ^ z[ y]);\
+	preXOR[i++] = (m[x1] ^ z[y1]);\
+	preXOR[i++] = (m[ y] ^ z[ x]);\
+	preXOR[i++] = (m[y1] ^ z[x1]);\
+}
+	/*
+	 * Fill preXOR in exactly the order the kernel reads c_xors. The first
+	 * entry additionally absorbs data[6], which the kernel therefore does
+	 * not add itself. In total 2 + 1 + 52*4 + 2 + 2 = 215 words are written.
+	 */
+	precalcXORGS(10,11);
+	preXOR[ 0]+=data[ 6];
+	preXOR[i++] = (m[9] ^ z[8]);
+	precalcXORGS2(12,13,14,15);
+	precalcXORGS2(14,10, 4, 8);
+	precalcXORGS2( 9,15,13, 6);
+	precalcXORGS2( 1,12, 0, 2);
+	precalcXORGS2(11, 7, 5, 3);
+	precalcXORGS2(11, 8,12, 0);
+	precalcXORGS2( 5, 2,15,13);
+	precalcXORGS2(10,14, 3, 6);
+	precalcXORGS2( 7, 1, 9, 4);
+	precalcXORGS2( 7, 9, 3, 1);
+	precalcXORGS2(13,12,11,14);
+	precalcXORGS2( 2, 6, 5,10);
+	precalcXORGS2( 4, 0,15, 8);
+	precalcXORGS2( 9, 0, 5, 7);
+	precalcXORGS2( 2, 4,10,15);
+	precalcXORGS2(14, 1,11,12);
+	precalcXORGS2( 6, 8, 3,13);
+	precalcXORGS2( 2,12, 6,10);
+	precalcXORGS2( 0,11, 8, 3);
+	precalcXORGS2( 4,13, 7, 5);
+	precalcXORGS2(15,14, 1, 9);
+	precalcXORGS2(12, 5, 1,15);
+	precalcXORGS2(14,13, 4,10);
+	precalcXORGS2( 0, 7, 6, 3);
+	precalcXORGS2( 9, 2, 8,11);
+	precalcXORGS2(13,11, 7,14);
+	precalcXORGS2(12, 1, 3, 9);
+	precalcXORGS2( 5, 0,15, 4);
+	precalcXORGS2( 8, 6, 2,10);
+	precalcXORGS2( 6,15,14, 9);
+	precalcXORGS2(11, 3, 0, 8);
+	precalcXORGS2(12, 2,13, 7);
+	precalcXORGS2( 1, 4,10, 5);
+	precalcXORGS2(10, 2, 8, 4);
+	precalcXORGS2( 7, 6, 1, 5);
+	precalcXORGS2(15,11, 9,14);
+	precalcXORGS2( 3,12,13, 0);
+	precalcXORGS2( 0, 1, 2, 3);
+	precalcXORGS2( 4, 5, 6, 7);
+	precalcXORGS2( 8, 9,10,11);
+	precalcXORGS2(12,13,14,15);
+	precalcXORGS2(14,10, 4, 8);
+	precalcXORGS2( 9,15,13, 6);
+	precalcXORGS2( 1,12, 0, 2);
+	precalcXORGS2(11, 7, 5, 3);
+	precalcXORGS2(11, 8,12, 0);
+	precalcXORGS2( 5, 2,15,13);
+	precalcXORGS2(10,14, 3, 6);
+	precalcXORGS2( 7, 1, 9, 4);
+	precalcXORGS2( 7, 9, 3, 1);
+	precalcXORGS2(13,12,11,14);
+	precalcXORGS2( 2, 6, 5,10);
+	precalcXORGS( 4, 0);
+	precalcXORGS(15, 8);
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_xors, preXOR, 215*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+}
+}
+
+/* ############################################################################################################################### */
+
+/*
+ * Per-thread "buffers allocated" flag. In this file it is only set inside
+ * the scanhash_decred body that is disabled with #if 0, and it is read and
+ * cleared by free_decred.
+ */
+static bool init[MAX_GPUS] = { 0 };
+
+// nonce position is different in decred
+// (index of the nonce within the header when viewed as 32-bit words)
+#define DCR_NONCE_OFT32 35
+
+#if 0
+extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[48];
+
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	uint32_t *pnonce = &pdata[DCR_NONCE_OFT32];
+
+	const uint32_t first_nonce = *pnonce;
+	const uint32_t targetHigh = opt_benchmark ? 0x1ULL : ptarget[6];
+
+	const int dev_id = device_map[thr_id];
+	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 29 : 25;
+	if (device_sm[dev_id] < 350) intensity = 22;
+
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	const dim3 grid((throughput + TPB-1)/(TPB));
+	const dim3 block(TPB);
+
+	if (!init[thr_id]){
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage (linux)
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+			CUDA_LOG_ERROR();
+		}
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
+		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
+		init[thr_id] = true;
+	}
+	memcpy(endiandata, pdata, 180);
+
+	decred_cpu_setBlock_52(endiandata);
+	h_resNonce[thr_id][0] = 1;
+
+	do {
+		if (h_resNonce[thr_id][0])
+			cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
+
+		// GPU HASH
+		decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh);
+		cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+		if (h_resNonce[thr_id][0])
+		{
+			cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], (h_resNonce[thr_id][0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+			for(uint32_t i=1; i <= h_resNonce[thr_id][0]; i++)
+			{
+				uint32_t _ALIGN(64) vhash[8];
+				be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][i]);
+				decred_hash(vhash, endiandata);
+				if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
+				{
+					int rc = 1;
+					work_set_target_ratio(work, vhash);
+					*hashes_done = (*pnonce) - first_nonce + throughput;
+					work->nonces[0] = swab32(h_resNonce[thr_id][i]);
+					// search for another nonce
+					for(uint32_t j=i+1; j <= h_resNonce[thr_id][0]; j++)
+					{
+						be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][j]);
+						decred_hash(vhash, endiandata);
+						if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)){
+							work->nonces[1] = swab32(h_resNonce[thr_id][j]);
+							if(!opt_quiet)
+								gpulog(LOG_NOTICE, thr_id, "second nonce found %u / %08x - %u / %08x", i, work->nonces[0], j, work->nonces[1]);
+							if(bn_hash_target_ratio(vhash, ptarget) > work->shareratio) {
+								work_set_target_ratio(work, vhash);
+								xchg(work->nonces[1], work->nonces[0]);
+							}
+							rc = 2;
+							break;
+						}
+					}
+					*pnonce = work->nonces[0];
+					return rc;
+				} else {
+					gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", i, h_resNonce[thr_id][i]);
+				}
+			}
+		}
+		*pnonce += throughput;
+
+	} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + (*pnonce));
+
+	*hashes_done = (*pnonce) - first_nonce;
+	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
+	return 0;
+}
+#endif
+
+// cleanup
+// free_decred releases the pinned host buffer and the device buffer that
+// belong to thread thr_id and clears its init flag. It does nothing when
+// init[thr_id] is not set. The device is synchronised before the buffers are
+// freed and once more afterwards.
+// NOTE(review): thr_id is used as an index without a range check - confirm
+// callers always pass a value below MAX_GPUS.
+extern "C" void free_decred(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaDeviceSynchronize();
+	cudaFreeHost(h_resNonce[thr_id]);
+	cudaFree(d_resNonce[thr_id]);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
diff --git a/device.go b/device.go
index 1a29f16..d7c6146 100644
--- a/device.go
+++ b/device.go
@@ -8,13 +8,14 @@ import (
 	"encoding/hex"
 	"fmt"
 	"io"
-	"math"
 	"math/big"
 	"os"
 	"sync"
 	"time"
 	"unsafe"
 
+	"github.com/mumax/3/cuda/cu"
+
 	"github.com/decred/dcrd/blockchain"
 	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
@@ -26,46 +27,16 @@ import (
 )
 
 const (
-	outputBufferSize = cl.CL_size_t(64)
-	localWorksize    = 64
-	uint32Size       = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
+	outputBufferSize   = cl.CL_size_t(64)
+	localWorksize      = 64
+	uint32Size         = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
+	cuOutputBufferSize = 64
 )
 
 var chainParams = &chaincfg.MainNetParams
 
 var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
 
-func getCLPlatforms() ([]cl.CL_platform_id, error) {
-	var numPlatforms cl.CL_uint
-	status := cl.CLGetPlatformIDs(0, nil, &numPlatforms)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetPlatformIDs")
-	}
-	platforms := make([]cl.CL_platform_id, numPlatforms)
-	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetPlatformIDs")
-	}
-	return platforms, nil
-}
-
-// getCLDevices returns the list of devices for the given platform.
-func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
-	var numDevices cl.CL_uint
-	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, 0, nil,
-		&numDevices)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetDeviceIDs")
-	}
-	devices := make([]cl.CL_device_id, numDevices)
-	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices,
-		devices, nil)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLGetDeviceIDs")
-	}
-	return devices, nil
-}
-
 func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	var programBuffer [1][]byte
 	var programSize [1]cl.CL_size_t
@@ -96,7 +67,10 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 
 type Device struct {
 	sync.Mutex
-	index        int
+	index int
+	cuda  bool
+
+	// Items for OpenCL device
 	platformID   cl.CL_platform_id
 	deviceID     cl.CL_device_id
 	deviceName   string
@@ -106,6 +80,13 @@ type Device struct {
 	program      cl.CL_program
 	kernel       cl.CL_kernel
 
+	// Items for CUDA device
+	cuDeviceID cu.Device
+	cuContext  cu.Context
+	//cuInput        cu.DevicePtr
+	cuInSize       int64
+	cuOutputBuffer []float64
+
 	workSize uint32
 
 	// extraNonce is the device extraNonce, where the first
@@ -141,182 +122,18 @@ func clError(status cl.CL_int, f string) error {
 		cl.ERROR_CODES_STRINGS[-status], status)
 }
 
-// ListDevices prints a list of devices present.
-func ListDevices() {
-	platformIDs, err := getCLPlatforms()
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Could not get CL platforms: %v\n", err)
-		os.Exit(1)
-	}
-
-	deviceListIndex := 0
-	for i := range platformIDs {
-		platformID := platformIDs[i]
-		deviceIDs, err := getCLDevices(platformID)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Could not get CL devices for platform: %v\n", err)
-			os.Exit(1)
-		}
-
-		for _, deviceID := range deviceIDs {
-			fmt.Printf("DEV #%d: %s\n", deviceListIndex, getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"))
-			deviceListIndex++
-		}
-	}
-}
-
-func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
-	workDone chan []byte) (*Device, error) {
-	d := &Device{
-		index:      index,
-		platformID: platformID,
-		deviceID:   deviceID,
-		deviceName: getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
-		quit:       make(chan struct{}),
-		newWork:    make(chan *work.Work, 5),
-		workDone:   workDone,
-	}
-
-	var status cl.CL_int
-
-	// Create the CL context.
-	d.context = cl.CLCreateContext(nil, 1, []cl.CL_device_id{deviceID},
-		nil, nil, &status)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLCreateContext")
-	}
-
-	// Create the command queue.
-	d.queue = cl.CLCreateCommandQueue(d.context, deviceID, 0, &status)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLCreateCommandQueue")
-	}
-
-	// Create the output buffer.
-	d.outputBuffer = cl.CLCreateBuffer(d.context, cl.CL_MEM_READ_WRITE,
-		uint32Size*outputBufferSize, nil, &status)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLCreateBuffer")
-	}
-
-	// Load kernel source.
-	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
-	if err != nil {
-		return nil, fmt.Errorf("Could not load kernel source: %v", err)
-	}
-
-	// Create the program.
-	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc[:],
-		progSize[:], &status)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLCreateProgramWithSource")
-	}
-
-	// Build the program for the device.
-	compilerOptions := ""
-	compilerOptions += fmt.Sprintf(" -D WORKSIZE=%d", localWorksize)
-	status = cl.CLBuildProgram(d.program, 1, []cl.CL_device_id{deviceID},
-		[]byte(compilerOptions), nil, nil)
-	if status != cl.CL_SUCCESS {
-		err = clError(status, "CLBuildProgram")
-
-		// Something went wrong! Print what it is.
-		var logSize cl.CL_size_t
-		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
-			cl.CL_PROGRAM_BUILD_LOG, 0, nil, &logSize)
-		if status != cl.CL_SUCCESS {
-			minrLog.Errorf("Could not obtain compilation error log: %v",
-				clError(status, "CLGetProgramBuildInfo"))
-		}
-		var programLog interface{}
-		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
-			cl.CL_PROGRAM_BUILD_LOG, logSize, &programLog, nil)
-		if status != cl.CL_SUCCESS {
-			minrLog.Errorf("Could not obtain compilation error log: %v",
-				clError(status, "CLGetProgramBuildInfo"))
-		}
-		minrLog.Errorf("%s\n", programLog)
-
-		return nil, err
-	}
-
-	// Create the kernel.
-	d.kernel = cl.CLCreateKernel(d.program, []byte("search"), &status)
-	if status != cl.CL_SUCCESS {
-		return nil, clError(status, "CLCreateKernel")
-	}
-
-	d.started = uint32(time.Now().Unix())
-
-	// Autocalibrate the desired work size for the kernel, or use one of the
-	// values passed explicitly by the use.
-	// The intensity or worksize must be set by the user.
-	userSetWorkSize := false
-	if len(cfg.IntensityInts) > 0 || len(cfg.WorkSizeInts) > 0 {
-		userSetWorkSize = true
-	}
-
-	var globalWorkSize uint32
-	if !userSetWorkSize {
-		// Apply the first setting as a global setting
-		calibrateTime := cfg.AutocalibrateInts[0]
-
-		// Override with the per-device setting if it exists
-		for i := range cfg.AutocalibrateInts {
-			if i == order {
-				calibrateTime = cfg.AutocalibrateInts[i]
-			}
-		}
-
-		idealWorkSize, err := d.calcWorkSizeForMilliseconds(calibrateTime)
-		if err != nil {
-			return nil, err
-		}
-
-		minrLog.Debugf("Autocalibration successful, work size for %v"+
-			"ms per kernel execution on device %v determined to be %v",
-			calibrateTime, d.index, idealWorkSize)
-
-		globalWorkSize = idealWorkSize
+func (d *Device) Release() {
+	if d.cuda {
+		d.cuContext.SetCurrent()
+		//d.cuInput.Free()
+		cu.CtxDestroy(&d.cuContext)
 	} else {
-		if len(cfg.IntensityInts) > 0 {
-			// Apply the first setting as a global setting
-			globalWorkSize = 1 << uint32(cfg.IntensityInts[0])
-
-			// Override with the per-device setting if it exists
-			for i := range cfg.IntensityInts {
-				if i == order {
-					globalWorkSize = 1 << uint32(cfg.IntensityInts[order])
-				}
-			}
-		}
-		if len(cfg.WorkSizeInts) > 0 {
-			// Apply the first setting as a global setting
-			globalWorkSize = uint32(cfg.WorkSizeInts[0])
-
-			// Override with the per-device setting if it exists
-			for i := range cfg.WorkSizeInts {
-				if i == order {
-					globalWorkSize = uint32(cfg.WorkSizeInts[order])
-				}
-			}
-		}
+		cl.CLReleaseKernel(d.kernel)
+		cl.CLReleaseProgram(d.program)
+		cl.CLReleaseCommandQueue(d.queue)
+		cl.CLReleaseMemObject(d.outputBuffer)
+		cl.CLReleaseContext(d.context)
 	}
-
-	intensity := math.Log2(float64(globalWorkSize))
-	minrLog.Infof("DEV #%d: Work size set to %v ('intensity' %v)",
-		d.index, globalWorkSize, intensity)
-	d.workSize = globalWorkSize
-
-	return d, nil
-}
-
-func (d *Device) Release() {
-	cl.CLReleaseKernel(d.kernel)
-	cl.CLReleaseProgram(d.program)
-	cl.CLReleaseCommandQueue(d.queue)
-	cl.CLReleaseMemObject(d.outputBuffer)
-	cl.CLReleaseContext(d.context)
 }
 
 func (d *Device) updateCurrentWork() {
@@ -367,10 +184,12 @@ func (d *Device) updateCurrentWork() {
 }
 
 func (d *Device) Run() {
-	//d.testFoundCandidate()
-	//return
-
-	err := d.runDevice()
+	var err error
+	if d.cuda {
+		err = d.runCuDevice()
+	} else {
+		err = d.runDevice()
+	}
 	if err != nil {
 		minrLog.Errorf("Error on device: %v", err)
 	}
@@ -409,122 +228,6 @@ func (d *Device) testFoundCandidate() {
 	//stratum submit {"params": ["test", "76df", "0200000000a461f2e3014335", "5783c78e", "e38c6e00"], "id": 4, "method": "mining.submit"}
 }
 
-func (d *Device) runDevice() error {
-	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
-	outputData := make([]uint32, outputBufferSize)
-
-	// Bump the extraNonce for the device it's running on
-	// when you begin mining. This ensures each device is doing
-	// different work. If the extraNonce has already been
-	// set for valid work, restore that.
-	d.extraNonce += uint32(d.index) << 24
-	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
-
-	var status cl.CL_int
-	for {
-		d.updateCurrentWork()
-
-		select {
-		case <-d.quit:
-			return nil
-		default:
-		}
-
-		// Increment extraNonce.
-		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
-
-		// Update the timestamp. Only solo work allows you to roll
-		// the timestamp.
-		ts := d.work.JobTime
-		if d.work.IsGetWork {
-			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
-			ts = d.work.JobTime + diffSeconds
-		}
-		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
-
-		// arg 0: pointer to the buffer
-		obuf := d.outputBuffer
-		status = cl.CLSetKernelArg(d.kernel, 0,
-			cl.CL_size_t(unsafe.Sizeof(obuf)),
-			unsafe.Pointer(&obuf))
-		if status != cl.CL_SUCCESS {
-			return clError(status, "CLSetKernelArg")
-		}
-
-		// args 1..8: midstate
-		for i := 0; i < 8; i++ {
-			ms := d.midstate[i]
-			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1),
-				uint32Size, unsafe.Pointer(&ms))
-			if status != cl.CL_SUCCESS {
-				return clError(status, "CLSetKernelArg")
-			}
-		}
-
-		// args 9..20: lastBlock except nonce
-		i2 := 0
-		for i := 0; i < 12; i++ {
-			if i2 == work.Nonce0Word {
-				i2++
-			}
-			lb := d.lastBlock[i2]
-			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9),
-				uint32Size, unsafe.Pointer(&lb))
-			if status != cl.CL_SUCCESS {
-				return clError(status, "CLSetKernelArg")
-			}
-			i2++
-		}
-
-		// Clear the found count from the buffer
-		status = cl.CLEnqueueWriteBuffer(d.queue, d.outputBuffer,
-			cl.CL_FALSE, 0, uint32Size, unsafe.Pointer(&zeroSlice[0]),
-			0, nil, nil)
-		if status != cl.CL_SUCCESS {
-			return clError(status, "CLEnqueueWriteBuffer")
-		}
-
-		// Execute the kernel and follow its execution time.
-		currentTime := time.Now()
-		var globalWorkSize [1]cl.CL_size_t
-		globalWorkSize[0] = cl.CL_size_t(d.workSize)
-		var localWorkSize [1]cl.CL_size_t
-		localWorkSize[0] = localWorksize
-		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil,
-			globalWorkSize[:], localWorkSize[:], 0, nil, nil)
-		if status != cl.CL_SUCCESS {
-			return clError(status, "CLEnqueueNDRangeKernel")
-		}
-
-		// Read the output buffer.
-		cl.CLEnqueueReadBuffer(d.queue, d.outputBuffer, cl.CL_TRUE, 0,
-			uint32Size*outputBufferSize, unsafe.Pointer(&outputData[0]), 0,
-			nil, nil)
-		if status != cl.CL_SUCCESS {
-			return clError(status, "CLEnqueueReadBuffer")
-		}
-
-		for i := uint32(0); i < outputData[0]; i++ {
-			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
-				"extraNonce %08x, workID %08x, timestamp %08x",
-				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
-				util.Uint32EndiannessSwap(d.currentWorkID),
-				d.lastBlock[work.TimestampWord])
-
-			// Assess the work. If it's below target, it'll be rejected
-			// here. The mining algorithm currently sends this function any
-			// difficulty 1 shares.
-			d.foundCandidate(d.lastBlock[work.TimestampWord], outputData[i+1],
-				d.lastBlock[work.Nonce1Word])
-		}
-
-		elapsedTime := time.Since(currentTime)
-		minrLog.Tracef("DEV #%d: Kernel execution to read time: %v", d.index,
-			elapsedTime)
-	}
-}
-
 func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	d.Lock()
 	defer d.Unlock()
diff --git a/glide.lock b/glide.lock
index c474bb5..e616159 100644
--- a/glide.lock
+++ b/glide.lock
@@ -1,5 +1,5 @@
 hash: 91c7f7aacbc4b5f82b14c9de3212c07257421b40c37926f571c3bc79f19c6060
-updated: 2016-07-27T16:32:52.717630149-04:00
+updated: 2016-08-16T11:10:41.055231248-04:00
 imports:
 - name: github.com/btcsuite/btclog
   version: f96df2375f37300305f329b8e5258764b4f19a7f
@@ -45,4 +45,8 @@ imports:
   version: b0909d3f798b97a03c9e77023f97a5301a2a7900
   subpackages:
   - edwards25519
+- name: github.com/mumax/3
+  version: 9859625390900fa7029ce8cfdeb430f239e502d4
+  subpackages:
+  - cuda/cu
 testImports: []
diff --git a/miner.go b/miner.go
index 8cb0e0f..311d9a0 100644
--- a/miner.go
+++ b/miner.go
@@ -45,22 +45,19 @@ func NewMiner() (*Miner, error) {
 		m.pool = s
 	}
 
-	platformIDs, err := getCLPlatforms()
-	if err != nil {
-		return nil, fmt.Errorf("Could not get CL platforms: %v", err)
-	}
-
-	deviceListIndex := 0
-	deviceListEnabledCount := 0
-
-	for p := range platformIDs {
-		platformID := platformIDs[p]
-		CLdeviceIDs, err := getCLDevices(platformID)
+	if cfg.UseCuda {
+		CUdeviceIDs, err := getCUInfo()
 		if err != nil {
-			return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
+			return nil, err
 		}
 
-		for _, CLdeviceID := range CLdeviceIDs {
+		deviceListIndex := 0
+		deviceListEnabledCount := 0
+
+		// XXX Can probably combine these bits with the opencl ones once
+		// I decide what to do about the types.
+
+		for _, CUDeviceID := range CUdeviceIDs {
 			miningAllowed := false
 
 			// Enforce device restrictions if they exist
@@ -75,7 +72,7 @@ func NewMiner() (*Miner, error) {
 			}
 
 			if miningAllowed {
-				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
+				newDevice, err := NewCuDevice(deviceListIndex, deviceListEnabledCount, CUDeviceID, m.workDone)
 				deviceListEnabledCount++
 				m.devices = append(m.devices, newDevice)
 				if err != nil {
@@ -84,10 +81,55 @@ func NewMiner() (*Miner, error) {
 			}
 			deviceListIndex++
 		}
-	}
 
-	if deviceListEnabledCount == 0 {
-		return nil, fmt.Errorf("No devices started")
+		if deviceListEnabledCount == 0 {
+			return nil, fmt.Errorf("No devices started")
+		}
+
+	} else {
+		platformIDs, err := getCLPlatforms()
+		if err != nil {
+			return nil, fmt.Errorf("Could not get CL platforms: %v", err)
+		}
+
+		deviceListIndex := 0
+		deviceListEnabledCount := 0
+
+		for p := range platformIDs {
+			platformID := platformIDs[p]
+			CLdeviceIDs, err := getCLDevices(platformID)
+			if err != nil {
+				return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
+			}
+
+			for _, CLdeviceID := range CLdeviceIDs {
+				miningAllowed := false
+
+				// Enforce device restrictions if they exist
+				if len(cfg.DeviceIDs) > 0 {
+					for _, i := range cfg.DeviceIDs {
+						if deviceListIndex == i {
+							miningAllowed = true
+						}
+					}
+				} else {
+					miningAllowed = true
+				}
+				if miningAllowed {
+					newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
+					deviceListEnabledCount++
+					m.devices = append(m.devices, newDevice)
+					if err != nil {
+						return nil, err
+					}
+				}
+				deviceListIndex++
+			}
+		}
+
+		if deviceListEnabledCount == 0 {
+			return nil, fmt.Errorf("No devices started")
+		}
 	}
 
 	m.started = uint32(time.Now().Unix())
diff --git a/miner.h b/miner.h
new file mode 100644
index 0000000..b68e831
--- /dev/null
+++ b/miner.h
@@ -0,0 +1,739 @@
+#ifndef __MINER_H__
+#define __MINER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#include <ccminer-config.h>
+
+#include <stdbool.h>
+#include <inttypes.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <curl/curl.h>
+
+#ifdef _MSC_VER
+#undef HAVE_ALLOCA_H
+#undef HAVE_SYSLOG_H
+#endif
+
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+#  include <stdlib.h>
+# endif
+#endif
+
+#ifdef HAVE_ALLOCA_H
+# include <alloca.h>
+#elif !defined alloca
+# ifdef __GNUC__
+#  define alloca __builtin_alloca
+# elif defined _AIX
+#  define alloca __alloca
+# elif defined _MSC_VER
+#  include <malloc.h>
+#  define alloca _alloca
+# elif !defined HAVE_ALLOCA
+void *alloca (size_t);
+# endif
+#endif
+
+#include "compat.h"
+
+#ifdef __INTELLISENSE__
+/* should be in stdint.h but... */
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+
+typedef unsigned __int32 time_t;
+typedef char *  va_list;
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0
+# undef _ALIGN
+# define _ALIGN(x) __align__(x)
+#endif
+
+#ifdef HAVE_SYSLOG_H
+#include <syslog.h>
+#define LOG_BLUE 0x10
+#define LOG_RAW  0x99
+#else
+enum {
+	LOG_ERR,
+	LOG_WARNING,
+	LOG_NOTICE,
+	LOG_INFO,
+	LOG_DEBUG,
+	/* custom notices */
+	LOG_BLUE = 0x10,
+	LOG_RAW  = 0x99
+};
+#endif
+
+typedef unsigned char uchar;
+
+#undef unlikely
+#undef likely
+#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
+#define unlikely(expr) (__builtin_expect(!!(expr), 0))
+#define likely(expr) (__builtin_expect(!!(expr), 1))
+#else
+#define unlikely(expr) (expr)
+#define likely(expr) (expr)
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+#ifndef max
+# define max(a, b)  ((a) > (b) ? (a) : (b))
+#endif
+#ifndef min
+# define min(a, b)  ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef UINT32_MAX
+/* for gcc 4.4 */
+#define UINT32_MAX UINT_MAX
+#endif
+
+static inline bool is_windows(void) {
+#ifdef WIN32
+        return 1;
+#else
+        return 0;
+#endif
+}
+
+#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+#define WANT_BUILTIN_BSWAP
+#else
+#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
+                   | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#define bswap_64(x) (((uint64_t) bswap_32((uint32_t)((x) & 0xffffffffu)) << 32) \
+                   | (uint64_t) bswap_32((uint32_t)((x) >> 32)))
+#endif
+
+static inline uint32_t swab32(uint32_t v)
+{
+#ifdef WANT_BUILTIN_BSWAP
+	return __builtin_bswap32(v);
+#else
+	return bswap_32(v);
+#endif
+}
+
+static inline uint64_t swab64(uint64_t v)
+{
+#ifdef WANT_BUILTIN_BSWAP
+	return __builtin_bswap64(v);
+#else
+	return bswap_64(v);
+#endif
+}
+
+static inline void swab256(void *dest_p, const void *src_p)
+{
+	uint32_t *dest = (uint32_t *) dest_p;
+	const uint32_t *src = (const uint32_t *) src_p;
+
+	dest[0] = swab32(src[7]);
+	dest[1] = swab32(src[6]);
+	dest[2] = swab32(src[5]);
+	dest[3] = swab32(src[4]);
+	dest[4] = swab32(src[3]);
+	dest[5] = swab32(src[2]);
+	dest[6] = swab32(src[1]);
+	dest[7] = swab32(src[0]);
+}
+
+#ifdef HAVE_SYS_ENDIAN_H
+#include <sys/endian.h>
+#endif
+
+#if !HAVE_DECL_BE32DEC
+static inline uint32_t be32dec(const void *pp)
+{
+	const uint8_t *p = (uint8_t const *)pp;
+	return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
+	    ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
+}
+#endif
+
+#if !HAVE_DECL_LE32DEC
+static inline uint32_t le32dec(const void *pp)
+{
+	const uint8_t *p = (uint8_t const *)pp;
+	return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
+	    ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
+}
+#endif
+
+#if !HAVE_DECL_BE32ENC
+static inline void be32enc(void *pp, uint32_t x)
+{
+	uint8_t *p = (uint8_t *)pp;
+	p[3] = x & 0xff;
+	p[2] = (x >> 8) & 0xff;
+	p[1] = (x >> 16) & 0xff;
+	p[0] = (x >> 24) & 0xff;
+}
+#endif
+
+#if !HAVE_DECL_LE32ENC
+static inline void le32enc(void *pp, uint32_t x)
+{
+	uint8_t *p = (uint8_t *)pp;
+	p[0] = x & 0xff;
+	p[1] = (x >> 8) & 0xff;
+	p[2] = (x >> 16) & 0xff;
+	p[3] = (x >> 24) & 0xff;
+}
+#endif
+
+#if !HAVE_DECL_BE16DEC
+static inline uint16_t be16dec(const void *pp)
+{
+	const uint8_t *p = (uint8_t const *)pp;
+	return ((uint16_t)(p[1]) + ((uint16_t)(p[0]) << 8));
+}
+#endif
+
+#if !HAVE_DECL_BE16ENC
+static inline void be16enc(void *pp, uint16_t x)
+{
+	uint8_t *p = (uint8_t *)pp;
+	p[1] = x & 0xff;
+	p[0] = (x >> 8) & 0xff;
+}
+#endif
+
+#if !HAVE_DECL_LE16DEC
+static inline uint16_t le16dec(const void *pp)
+{
+	const uint8_t *p = (uint8_t const *)pp;
+	return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8));
+}
+#endif
+
+#if !HAVE_DECL_LE16ENC
+static inline void le16enc(void *pp, uint16_t x)
+{
+	uint8_t *p = (uint8_t *)pp;
+	p[0] = x & 0xff;
+	p[1] = (x >> 8) & 0xff;
+}
+#endif
+
+/* used for struct work */
+void *aligned_calloc(int size);
+void aligned_free(void *ptr);
+
+#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION
+
+void sha256_init(uint32_t *state);
+void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
+void sha256d(unsigned char *hash, const unsigned char *data, int len);
+
+#define HAVE_SHA256_4WAY 0
+#define HAVE_SHA256_8WAY 0
+
+struct work;
+
+extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+
+/* free device allocated memory per algo */
+void algo_free_all(int thr_id);
+
+extern void free_decred(int thr_id);
+
+/* api related */
+void *api_thread(void *userdata);
+void api_set_throughput(int thr_id, uint32_t throughput);
+
+struct cgpu_info {
+	uint8_t gpu_id;
+	uint8_t thr_id;
+	int accepted;
+	int rejected;
+	int hw_errors;
+	double khashes;
+	uint8_t intensity_int;
+	uint8_t has_monitoring;
+	float gpu_temp;
+	uint16_t gpu_fan;
+	uint16_t gpu_fan_rpm;
+	uint16_t gpu_arch;
+	int gpu_clock;
+	int gpu_memclock;
+	size_t gpu_mem;
+	size_t gpu_memfree;
+	uint32_t gpu_power;
+	double gpu_vddc;
+	int16_t gpu_pstate;
+	int16_t gpu_bus;
+	uint16_t gpu_vid;
+	uint16_t gpu_pid;
+
+	int8_t nvml_id;
+	int8_t nvapi_id;
+
+	char gpu_sn[64];
+	char gpu_desc[64];
+	float intensity;
+	uint32_t throughput;
+};
+
+struct thr_api {
+	int id;
+	pthread_t pth;
+	struct thread_q	*q;
+};
+
+struct stats_data {
+	uint32_t uid;
+	uint32_t tm_stat;
+	uint32_t hashcount;
+	uint32_t height;
+
+	double difficulty;
+	double hashrate;
+
+	uint8_t thr_id;
+	uint8_t gpu_id;
+	uint8_t hashfound;
+	uint8_t ignored;
+
+	uint8_t npool;
+	uint8_t pool_type;
+	uint16_t align;
+};
+
+struct hashlog_data {
+	uint8_t npool;
+	uint8_t pool_type;
+	uint16_t align;
+
+	uint32_t height;
+	uint32_t njobid;
+	uint32_t nonce;
+	uint32_t scanned_from;
+	uint32_t scanned_to;
+	uint32_t last_from;
+	uint32_t tm_add;
+	uint32_t tm_upd;
+	uint32_t tm_sent;
+};
+
+/* end of api */
+
+struct thr_info {
+	int		id;
+	pthread_t	pth;
+	struct thread_q	*q;
+	struct cgpu_info gpu;
+};
+
+struct work_restart {
+	/* volatile so it can be modified across threads (vstudio thing) */
+	volatile uint32_t restart;
+	char padding[128 - sizeof(uint32_t)];
+};
+
+#ifdef HAVE_GETOPT_LONG
+#include <getopt.h>
+#else
+struct option {
+	const char *name;
+	int has_arg;
+	int *flag;
+	int val;
+};
+#endif
+extern int options_count();
+
+extern bool opt_benchmark;
+extern bool opt_debug;
+extern bool opt_quiet;
+extern bool opt_protocol;
+extern bool opt_showdiff;
+extern bool opt_tracegpu;
+extern int opt_n_threads;
+extern int active_gpus;
+extern int gpu_threads;
+extern int opt_timeout;
+extern bool want_longpoll;
+extern bool have_longpoll;
+extern bool want_stratum;
+extern bool have_stratum;
+extern bool opt_stratum_stats;
+extern char *opt_cert;
+extern char *opt_proxy;
+extern long opt_proxy_type;
+extern bool use_syslog;
+extern bool use_colors;
+extern int use_pok;
+extern pthread_mutex_t applog_lock;
+extern struct thr_info *thr_info;
+extern int longpoll_thr_id;
+extern int stratum_thr_id;
+extern int api_thr_id;
+extern volatile bool abort_flag;
+extern struct work_restart *work_restart;
+extern bool opt_trust_pool;
+extern uint16_t opt_vote;
+
+extern uint64_t global_hashrate;
+extern uint64_t net_hashrate;
+extern double net_diff;
+extern double stratum_diff;
+
+#define MAX_GPUS 16
+//#define MAX_THREADS 32 todo
+extern char* device_name[MAX_GPUS];
+extern short device_map[MAX_GPUS];
+extern long  device_sm[MAX_GPUS];
+extern uint32_t gpus_intensity[MAX_GPUS];
+extern int opt_cudaschedule;
+
+// cuda.cpp
+int cuda_num_devices();
+void cuda_devicenames();
+void cuda_reset_device(int thr_id, bool *init);
+void cuda_shutdown();
+int cuda_finddevice(char *name);
+int cuda_version();
+void cuda_print_devices();
+int cuda_gpu_info(struct cgpu_info *gpu);
+int cuda_available_memory(int thr_id);
+
+uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
+#define device_intensity(t,f,d) cuda_default_throughput(t,d)
+
+void cuda_log_lasterror(int thr_id, const char* func, int line);
+void cuda_clear_lasterror();
+#define CUDA_LOG_ERROR() cuda_log_lasterror(thr_id, __func__, __LINE__)
+
+#define CL_N    "\x1B[0m"
+#define CL_RED  "\x1B[31m"
+#define CL_GRN  "\x1B[32m"
+#define CL_YLW  "\x1B[33m"
+#define CL_BLU  "\x1B[34m"
+#define CL_MAG  "\x1B[35m"
+#define CL_CYN  "\x1B[36m"
+
+#define CL_BLK  "\x1B[22;30m" /* black */
+#define CL_RD2  "\x1B[22;31m" /* red */
+#define CL_GR2  "\x1B[22;32m" /* green */
+#define CL_YL2  "\x1B[22;33m" /* dark yellow */
+#define CL_BL2  "\x1B[22;34m" /* blue */
+#define CL_MA2  "\x1B[22;35m" /* magenta */
+#define CL_CY2  "\x1B[22;36m" /* cyan */
+#define CL_SIL  "\x1B[22;37m" /* gray */
+
+#ifdef WIN32
+#define CL_GRY  "\x1B[01;30m" /* dark gray */
+#else
+#define CL_GRY  "\x1B[90m"    /* dark gray selectable in putty */
+#endif
+#define CL_LRD  "\x1B[01;31m" /* light red */
+#define CL_LGR  "\x1B[01;32m" /* light green */
+#define CL_LYL  "\x1B[01;33m" /* tooltips */
+#define CL_LBL  "\x1B[01;34m" /* light blue */
+#define CL_LMA  "\x1B[01;35m" /* light magenta */
+#define CL_LCY  "\x1B[01;36m" /* light cyan */
+
+#define CL_WHT  "\x1B[01;37m" /* white */
+
+extern void format_hashrate(double hashrate, char *output);
+extern void applog(int prio, const char *fmt, ...);
+extern void gpulog(int prio, int thr_id, const char *fmt, ...);
+void get_defconfig_path(char *out, size_t bufsize, char *argv0);
+extern void cbin2hex(char *out, const char *in, size_t len);
+extern char *bin2hex(const unsigned char *in, size_t len);
+extern bool hex2bin(void *output, const char *hexstr, size_t len);
+extern int timeval_subtract(struct timeval *result, struct timeval *x,
+	struct timeval *y);
+extern bool fulltest(const uint32_t *hash, const uint32_t *target);
+void diff_to_target(uint32_t* target, double diff);
+void work_set_target(struct work* work, double diff);
+double target_to_diff(uint32_t* target);
+extern void get_currentalgo(char* buf, int sz);
+
+// bignum
+double bn_convert_nbits(const uint32_t nbits);
+void bn_nbits_to_uchar(const uint32_t nBits, uchar *target);
+double bn_hash_target_ratio(uint32_t* hash, uint32_t* target);
+void bn_store_hash_target_ratio(uint32_t* hash, uint32_t* target, struct work* work);
+void work_set_target_ratio(struct work* work, uint32_t* hash);
+
+// bench
+extern int bench_algo;
+void bench_init(int threads);
+void bench_free();
+bool bench_algo_switch_next(int thr_id);
+void bench_set_throughput(int thr_id, uint32_t throughput);
+void bench_display_results();
+
+struct stratum_job {
+	char *job_id;
+	unsigned char prevhash[32];
+	size_t coinbase_size;
+	unsigned char *coinbase;
+	unsigned char *xnonce2;
+	int merkle_count;
+	unsigned char **merkle;
+	unsigned char version[4];
+	unsigned char nbits[4];
+	unsigned char ntime[4];
+	unsigned char claim[32]; // lbry
+	bool clean;
+	unsigned char nreward[2];
+	uint32_t height;
+	double diff;
+};
+
+struct stratum_ctx {
+	char *url;
+
+	CURL *curl;
+	char *curl_url;
+	char curl_err_str[CURL_ERROR_SIZE];
+	curl_socket_t sock;
+	size_t sockbuf_size;
+	char *sockbuf;
+
+	double next_diff;
+	double sharediff;
+
+	char *session_id;
+	size_t xnonce1_size;
+	unsigned char *xnonce1;
+	size_t xnonce2_size;
+	struct stratum_job job;
+
+	struct timeval tv_submit;
+	uint32_t answer_msec;
+	int pooln;
+	time_t tm_connected;
+
+	int srvtime_diff;
+};
+
+#define POK_MAX_TXS   4
+#define POK_MAX_TX_SZ 16384U
+struct tx {
+	uint8_t data[POK_MAX_TX_SZ];
+	uint32_t len;
+};
+
+struct work {
+	uint32_t data[48];
+	uint32_t target[8];
+	uint32_t maxvote;
+
+	char job_id[128];
+	size_t xnonce2_len;
+	uchar xnonce2[32];
+
+	union {
+		uint32_t u32[2];
+		uint64_t u64[1];
+	} noncerange;
+
+	uint32_t nonces[2];
+
+	double targetdiff;
+	double shareratio;
+	double sharediff;
+	uint32_t height;
+	uint8_t  pooln;
+
+	uint32_t scanned_from;
+	uint32_t scanned_to;
+
+	/* pok getwork txs */
+	uint32_t tx_count;
+	struct tx txs[POK_MAX_TXS];
+};
+
+#define POK_BOOL_MASK 0x00008000
+#define POK_DATA_MASK 0xFFFF0000
+
+
+#define MAX_POOLS 8
+struct pool_infos {
+	uint8_t id;
+#define POOL_UNUSED   0
+#define POOL_GETWORK  1
+#define POOL_STRATUM  2
+#define POOL_LONGPOLL 4
+	uint8_t type;
+#define POOL_ST_DEFINED 1
+#define POOL_ST_VALID 2
+#define POOL_ST_DISABLED 4
+#define POOL_ST_REMOVED 8
+	uint16_t status;
+	int algo;
+	char name[64];
+	// credentials
+	char url[512];
+	char short_url[64];
+	char user[64];
+	char pass[384];
+	// config options
+	double max_diff;
+	double max_rate;
+	int shares_limit;
+	int time_limit;
+	int scantime;
+	// connection
+	struct stratum_ctx stratum;
+	uint8_t allow_gbt;
+	uint8_t allow_mininginfo;
+	uint16_t check_dups; // 16_t for align
+	int retries;
+	int fail_pause;
+	int timeout;
+	// stats
+	uint32_t work_time;
+	uint32_t wait_time;
+	uint32_t accepted_count;
+	uint32_t rejected_count;
+	uint32_t solved_count;
+	uint32_t stales_count;
+	time_t last_share_time;
+	double best_share;
+	uint32_t disconnects;
+};
+
+extern struct pool_infos pools[MAX_POOLS];
+extern int num_pools;
+extern volatile int cur_pooln;
+
+void pool_init_defaults(void);
+void pool_set_creds(int pooln);
+void pool_set_attr(int pooln, const char* key, char* arg);
+bool pool_switch_url(char *params);
+bool pool_switch(int thr_id, int pooln);
+bool pool_switch_next(int thr_id);
+int pool_get_first_valid(int startfrom);
+void pool_dump_infos(void);
+
+bool stratum_socket_full(struct stratum_ctx *sctx, int timeout);
+bool stratum_send_line(struct stratum_ctx *sctx, char *s);
+char *stratum_recv_line(struct stratum_ctx *sctx);
+bool stratum_connect(struct stratum_ctx *sctx, const char *url);
+void stratum_disconnect(struct stratum_ctx *sctx);
+bool stratum_subscribe(struct stratum_ctx *sctx);
+bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
+bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
+void stratum_free_job(struct stratum_ctx *sctx);
+
+void hashlog_remember_submit(struct work* work, uint32_t nonce);
+void hashlog_remember_scan_range(struct work* work);
+uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
+uint32_t hashlog_get_last_sent(char* jobid);
+uint64_t hashlog_get_scan_range(char* jobid);
+int  hashlog_get_history(struct hashlog_data *data, int max_records);
+void hashlog_purge_old(void);
+void hashlog_purge_job(char* jobid);
+void hashlog_purge_all(void);
+void hashlog_dump_job(char* jobid);
+void hashlog_getmeminfo(uint64_t *mem, uint32_t *records);
+
+void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found, uint32_t height);
+double stats_get_speed(int thr_id, double def_speed);
+double stats_get_gpu_speed(int gpu_id);
+int  stats_get_history(int thr_id, struct stats_data *data, int max_records);
+void stats_purge_old(void);
+void stats_purge_all(void);
+void stats_getmeminfo(uint64_t *mem, uint32_t *records);
+
+struct thread_q;
+
+extern struct thread_q *tq_new(void);
+extern void tq_free(struct thread_q *tq);
+extern bool tq_push(struct thread_q *tq, void *data);
+extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime);
+extern void tq_freeze(struct thread_q *tq);
+extern void tq_thaw(struct thread_q *tq);
+
+#define EXIT_CODE_OK            0
+#define EXIT_CODE_USAGE         1
+#define EXIT_CODE_POOL_TIMEOUT  2
+#define EXIT_CODE_SW_INIT_ERROR 3
+#define EXIT_CODE_CUDA_NODEVICE 4
+#define EXIT_CODE_CUDA_ERROR    5
+#define EXIT_CODE_TIME_LIMIT    0
+#define EXIT_CODE_KILLED        7
+
+void parse_arg(int key, char *arg);
+void proper_exit(int reason);
+void restart_threads(void);
+
+size_t time2str(char* buf, time_t timer);
+char* atime2str(time_t timer);
+
+void applog_hex(void *data, int len);
+void applog_hash(void *hash);
+void applog_hash64(void *hash);
+void applog_compare_hash(void *hash, void *hash_ref);
+
+void print_hash_tests(void);
+void blake256hash(void *output, const void *input, int8_t rounds);
+void blake2s_hash(void *output, const void *input);
+void bmw_hash(void *state, const void *input);
+void c11hash(void *output, const void *input);
+void decred_hash(void *state, const void *input);
+void deephash(void *state, const void *input);
+void luffa_hash(void *state, const void *input);
+void fresh_hash(void *state, const void *input);
+void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
+void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
+void keccak256_hash(void *state, const void *input);
+unsigned int jackpothash(void *state, const void *input);
+void groestlhash(void *state, const void *input);
+void lbry_hash(void *output, const void *input);
+void lyra2re_hash(void *state, const void *input);
+void lyra2v2_hash(void *state, const void *input);
+void myriadhash(void *state, const void *input);
+void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
+void nist5hash(void *state, const void *input);
+void pentablakehash(void *output, const void *input);
+void quarkhash(void *state, const void *input);
+void qubithash(void *state, const void *input);
+void scrypthash(void* output, const void* input);
+void scryptjane_hash(void* output, const void* input);
+void sibhash(void *output, const void *input);
+void skeincoinhash(void *output, const void *input);
+void skein2hash(void *output, const void *input);
+void s3hash(void *output, const void *input);
+void wcoinhash(void *state, const void *input);
+void whirlxHash(void *state, const void *input);
+void x11evo_hash(void *output, const void *input);
+void x11hash(void *output, const void *input);
+void x13hash(void *output, const void *input);
+void x14hash(void *output, const void *input);
+void x15hash(void *output, const void *input);
+void x17hash(void *output, const void *input);
+void zr5hash(void *output, const void *input);
+void zr5hash_pok(void *output, uint32_t *pdata);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __MINER_H__ */
diff --git a/sph/blake.c b/sph/blake.c
new file mode 100644
index 0000000..f2d6613
--- /dev/null
+++ b/sph/blake.c
@@ -0,0 +1,1133 @@
+//+build ignore
+
+/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
+/*
+ * BLAKE implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_blake.h"
+
+int blake256_rounds = 14;	/* round count read by COMPRESS32; changed via sph_blake256_set_rounds() */
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_SMALL_FOOTPRINT_BLAKE   1	/* follow SPH_SMALL_FOOTPRINT unless already set explicitly */
+#endif
+
+#if SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_COMPACT_BLAKE_32   1	/* selects the looped, sigma[]/CS[]-driven ROUND_S and COMPRESS32 */
+#endif
+
+#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
+#define SPH_COMPACT_BLAKE_64   1	/* selects the looped, sigma[]/CB[]-driven ROUND_B and COMPRESS64 */
+#endif
+
+#ifdef _MSC_VER	/* MSVC C4146 (unary minus on unsigned); presumably for "-z" in the close routines */
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[8] = {	/* BLAKE-224 start value, copied into H[] by blake32_init() */
+	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507),
+	SPH_C32(0x3070DD17), SPH_C32(0xF70E5939),
+	SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
+	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
+};
+
+static const sph_u32 IV256[8] = {	/* BLAKE-256 start value, copied into H[] by blake32_init() */
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+#if SPH_64
+
+static const sph_u64 IV384[8] = {	/* BLAKE-384 start value, copied into H[] by blake64_init() */
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+
+static const sph_u64 IV512[8] = {	/* BLAKE-512 start value, copied into H[] by blake64_init() */
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+#endif
+
+#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
+
+static const unsigned sigma[16][16] = {	/* sigma[r][i]: index of the word used in slot i of round r */
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },	/* rows 10..15 repeat rows 0..5 */
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
+};
+
+/*
+  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+ 14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
+ 11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
+  7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
+  9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
+  2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
+ 12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
+ 13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
+  6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
+ 10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
+*/
+#endif
+
+#define Z00   0	/* Z<r><i>: sigma[r][i] as one hex digit, pasted onto M/CS/CB by Mx()/CSx()/CBx() */
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E	/* permutation row 1 */
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B	/* permutation row 2 */
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+
+#define Z30   7	/* permutation row 3 */
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+
+#define Z40   9	/* permutation row 4 */
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+
+#define Z50   2	/* permutation row 5 */
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+
+#define Z60   C	/* permutation row 6 */
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+
+#define Z70   D	/* permutation row 7 */
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+
+#define Z80   6	/* permutation row 8 */
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+
+#define Z90   A	/* permutation row 9 */
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+
+#define Mx(r, i)    Mx_(Z ## r ## i)	/* message word for slot i of round r: becomes M<digit> */
+#define Mx_(n)      Mx__(n)	/* extra level so the Z<r><i> macro is expanded before pasting */
+#define Mx__(n)     M ## n
+
+#define CSx(r, i)   CSx_(Z ## r ## i)	/* 32-bit constant for slot i of round r: becomes CS<digit> */
+#define CSx_(n)     CSx__(n)	/* extra level so the Z<r><i> macro is expanded before pasting */
+#define CSx__(n)    CS ## n
+
+#define CS0   SPH_C32(0x243F6A88)	/* CS0..CSF: 32-bit constants used by GS and COMPRESS32 */
+#define CS1   SPH_C32(0x85A308D3)
+#define CS2   SPH_C32(0x13198A2E)
+#define CS3   SPH_C32(0x03707344)
+#define CS4   SPH_C32(0xA4093822)
+#define CS5   SPH_C32(0x299F31D0)
+#define CS6   SPH_C32(0x082EFA98)
+#define CS7   SPH_C32(0xEC4E6C89)
+#define CS8   SPH_C32(0x452821E6)
+#define CS9   SPH_C32(0x38D01377)
+#define CSA   SPH_C32(0xBE5466CF)
+#define CSB   SPH_C32(0x34E90C6C)
+#define CSC   SPH_C32(0xC0AC29B7)
+#define CSD   SPH_C32(0xC97C50DD)
+#define CSE   SPH_C32(0x3F84D5B5)
+#define CSF   SPH_C32(0xB5470917)
+
+#if SPH_COMPACT_BLAKE_32
+
+static const sph_u32 CS[16] = {	/* same values as CS0..CSF, as an array indexable through sigma[][] */
+	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
+	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
+	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
+	SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
+};
+
+#endif
+
+#if SPH_64
+
+#define CBx(r, i)   CBx_(Z ## r ## i)	/* 64-bit constant for slot i of round r: becomes CB<digit> */
+#define CBx_(n)     CBx__(n)	/* extra level so the Z<r><i> macro is expanded before pasting */
+#define CBx__(n)    CB ## n
+
+#define CB0   SPH_C64(0x243F6A8885A308D3)	/* CB0..CBF: 64-bit constants used by GB and COMPRESS64 */
+#define CB1   SPH_C64(0x13198A2E03707344)
+#define CB2   SPH_C64(0xA4093822299F31D0)
+#define CB3   SPH_C64(0x082EFA98EC4E6C89)
+#define CB4   SPH_C64(0x452821E638D01377)
+#define CB5   SPH_C64(0xBE5466CF34E90C6C)
+#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
+#define CB7   SPH_C64(0x3F84D5B5B5470917)
+#define CB8   SPH_C64(0x9216D5D98979FB1B)
+#define CB9   SPH_C64(0xD1310BA698DFB5AC)
+#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
+#define CBB   SPH_C64(0xB8E1AFED6A267E96)
+#define CBC   SPH_C64(0xBA7C9045F12C7F99)
+#define CBD   SPH_C64(0x24A19947B3916CF7)
+#define CBE   SPH_C64(0x0801F2E2858EFC16)
+#define CBF   SPH_C64(0x636920D871574E69)
+
+#if SPH_COMPACT_BLAKE_64
+
+static const sph_u64 CB[16] = {	/* same values as CB0..CBF, as an array indexable through sigma[][] */
+	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
+	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
+	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
+	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
+	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
+	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
+	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
+	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+};
+
+#endif
+
+#endif
+
+#define GS(m0, m1, c0, c1, a, b, c, d)   do { /* 32-bit mix of a,b,c,d with two words and two constants */ \
+		a = SPH_T32(a + b + (m0 ^ c1)); /* first word is paired with the second constant */ \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = SPH_T32(a + b + (m1 ^ c0)); /* second word is paired with the first constant */ \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_32
+
+#define ROUND_S(r)   do { /* table-driven round r: r may be a run-time value, must index sigma[] */ \
+		GS(M[sigma[r][0x0]], M[sigma[r][0x1]], /* four column mixes */ \
+			CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
+		GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \
+			CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
+		GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \
+			CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
+		GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \
+			CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
+		GS(M[sigma[r][0x8]], M[sigma[r][0x9]], /* four diagonal mixes */ \
+			CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
+		GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \
+			CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
+		GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \
+			CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
+		GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \
+			CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
+	} while (0)
+
+#else
+
+#define ROUND_S(r)   do { /* r must be a literal digit 0..9: it is pasted into Z<r><i> */ \
+		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); /* columns */ \
+		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); /* diagonals */ \
+		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+#endif
+
+#if SPH_64
+
+#define GB(m0, m1, c0, c1, a, b, c, d)   do { /* 64-bit mix of a,b,c,d with two words and two constants */ \
+		a = SPH_T64(a + b + (m0 ^ c1)); /* first word is paired with the second constant */ \
+		d = SPH_ROTR64(d ^ a, 32); \
+		c = SPH_T64(c + d); \
+		b = SPH_ROTR64(b ^ c, 25); \
+		a = SPH_T64(a + b + (m1 ^ c0)); /* second word is paired with the first constant */ \
+		d = SPH_ROTR64(d ^ a, 16); \
+		c = SPH_T64(c + d); \
+		b = SPH_ROTR64(b ^ c, 11); \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_64
+
+#define ROUND_B(r)   do { /* table-driven round r: r may be a run-time value, must index sigma[] */ \
+		GB(M[sigma[r][0x0]], M[sigma[r][0x1]], /* four column mixes */ \
+			CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
+		GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \
+			CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
+		GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \
+			CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
+		GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \
+			CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
+		GB(M[sigma[r][0x8]], M[sigma[r][0x9]], /* four diagonal mixes */ \
+			CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
+		GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \
+			CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
+		GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \
+			CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
+		GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \
+			CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
+	} while (0)
+
+#else
+
+#define ROUND_B(r)   do { /* r must be a literal digit 0..9: it is pasted into Z<r><i> */ \
+		GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); /* columns */ \
+		GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
+		GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
+		GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
+		GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); /* diagonals */ \
+		GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
+		GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
+		GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+#endif
+
+#endif
+
+#define DECL_STATE32 /* declares local copies of the context fields H[0..7], S[0..3], T0, T1 */ \
+	sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \
+	sph_u32 S0, S1, S2, S3, T0, T1;
+
+#define READ_STATE32(state)   do { /* load the DECL_STATE32 locals from *state */ \
+		H0 = (state)->H[0]; \
+		H1 = (state)->H[1]; \
+		H2 = (state)->H[2]; \
+		H3 = (state)->H[3]; \
+		H4 = (state)->H[4]; \
+		H5 = (state)->H[5]; \
+		H6 = (state)->H[6]; \
+		H7 = (state)->H[7]; \
+		S0 = (state)->S[0]; \
+		S1 = (state)->S[1]; \
+		S2 = (state)->S[2]; \
+		S3 = (state)->S[3]; \
+		T0 = (state)->T0; \
+		T1 = (state)->T1; \
+	} while (0)
+
+#define WRITE_STATE32(state)   do { /* store the DECL_STATE32 locals back into *state */ \
+		(state)->H[0] = H0; \
+		(state)->H[1] = H1; \
+		(state)->H[2] = H2; \
+		(state)->H[3] = H3; \
+		(state)->H[4] = H4; \
+		(state)->H[5] = H5; \
+		(state)->H[6] = H6; \
+		(state)->H[7] = H7; \
+		(state)->S[0] = S0; \
+		(state)->S[1] = S1; \
+		(state)->S[2] = S2; \
+		(state)->S[3] = S3; \
+		(state)->T0 = T0; \
+		(state)->T1 = T1; \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_32
+
+#define COMPRESS32   do { /* compress the 64 bytes at buf into H0..H7; uses buf and DECL_STATE32 locals */ \
+		sph_u32 M[16]; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		unsigned r; \
+		V0 = H0; /* V0..V7: current chaining value */ \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; /* V8..VB: salt xor constants */ \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; /* VC..VF: counter words xor constants */ \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M[0x0] = sph_dec32be_aligned(buf +  0); /* block read as 16 big-endian 32-bit words */ \
+		M[0x1] = sph_dec32be_aligned(buf +  4); \
+		M[0x2] = sph_dec32be_aligned(buf +  8); \
+		M[0x3] = sph_dec32be_aligned(buf + 12); \
+		M[0x4] = sph_dec32be_aligned(buf + 16); \
+		M[0x5] = sph_dec32be_aligned(buf + 20); \
+		M[0x6] = sph_dec32be_aligned(buf + 24); \
+		M[0x7] = sph_dec32be_aligned(buf + 28); \
+		M[0x8] = sph_dec32be_aligned(buf + 32); \
+		M[0x9] = sph_dec32be_aligned(buf + 36); \
+		M[0xA] = sph_dec32be_aligned(buf + 40); \
+		M[0xB] = sph_dec32be_aligned(buf + 44); \
+		M[0xC] = sph_dec32be_aligned(buf + 48); \
+		M[0xD] = sph_dec32be_aligned(buf + 52); \
+		M[0xE] = sph_dec32be_aligned(buf + 56); \
+		M[0xF] = sph_dec32be_aligned(buf + 60); \
+		for (r = 0; r < blake256_rounds; r ++) /* sigma[] has 16 rows: rounds must stay <= 16 */ \
+			ROUND_S(r); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V and the salt back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#else
+
+#define COMPRESS32   do { /* compress the 64 bytes at buf into H0..H7; uses buf and DECL_STATE32 locals */ \
+		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; /* V0..V7: current chaining value */ \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; /* V8..VB: salt xor constants */ \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; /* VC..VF: counter words xor constants */ \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M0 = sph_dec32be_aligned(buf +  0); /* block read as 16 big-endian 32-bit words */ \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		M8 = sph_dec32be_aligned(buf + 32); \
+		M9 = sph_dec32be_aligned(buf + 36); \
+		MA = sph_dec32be_aligned(buf + 40); \
+		MB = sph_dec32be_aligned(buf + 44); \
+		MC = sph_dec32be_aligned(buf + 48); \
+		MD = sph_dec32be_aligned(buf + 52); \
+		ME = sph_dec32be_aligned(buf + 56); \
+		MF = sph_dec32be_aligned(buf + 60); \
+		ROUND_S(0); /* the first 8 rounds always run */ \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		ROUND_S(4); \
+		ROUND_S(5); \
+		ROUND_S(6); \
+		ROUND_S(7); \
+		if (blake256_rounds == 14) { /* any other value stops after 8 rounds */ \
+		ROUND_S(8); \
+		ROUND_S(9); \
+		ROUND_S(0); /* rounds 10..13 reuse permutation rows 0..3 */ \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		} \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V and the salt back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#if SPH_64
+
+#define DECL_STATE64 /* declares local copies of the context fields H[0..7], S[0..3], T0, T1 */ \
+	sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \
+	sph_u64 S0, S1, S2, S3, T0, T1;
+
+#define READ_STATE64(state)   do { /* load the DECL_STATE64 locals from *state */ \
+		H0 = (state)->H[0]; \
+		H1 = (state)->H[1]; \
+		H2 = (state)->H[2]; \
+		H3 = (state)->H[3]; \
+		H4 = (state)->H[4]; \
+		H5 = (state)->H[5]; \
+		H6 = (state)->H[6]; \
+		H7 = (state)->H[7]; \
+		S0 = (state)->S[0]; \
+		S1 = (state)->S[1]; \
+		S2 = (state)->S[2]; \
+		S3 = (state)->S[3]; \
+		T0 = (state)->T0; \
+		T1 = (state)->T1; \
+	} while (0)
+
+#define WRITE_STATE64(state)   do { /* store the DECL_STATE64 locals back into *state */ \
+		(state)->H[0] = H0; \
+		(state)->H[1] = H1; \
+		(state)->H[2] = H2; \
+		(state)->H[3] = H3; \
+		(state)->H[4] = H4; \
+		(state)->H[5] = H5; \
+		(state)->H[6] = H6; \
+		(state)->H[7] = H7; \
+		(state)->S[0] = S0; \
+		(state)->S[1] = S1; \
+		(state)->S[2] = S2; \
+		(state)->S[3] = S3; \
+		(state)->T0 = T0; \
+		(state)->T1 = T1; \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_64
+
+#define COMPRESS64   do { /* compress the 128 bytes at buf into H0..H7; uses buf and DECL_STATE64 locals */ \
+		sph_u64 M[16]; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		unsigned r; \
+		V0 = H0; /* V0..V7: current chaining value */ \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CB0; /* V8..VB: salt xor constants */ \
+		V9 = S1 ^ CB1; \
+		VA = S2 ^ CB2; \
+		VB = S3 ^ CB3; \
+		VC = T0 ^ CB4; /* VC..VF: counter words xor constants */ \
+		VD = T0 ^ CB5; \
+		VE = T1 ^ CB6; \
+		VF = T1 ^ CB7; \
+		M[0x0] = sph_dec64be_aligned(buf +   0); /* block read as 16 big-endian 64-bit words */ \
+		M[0x1] = sph_dec64be_aligned(buf +   8); \
+		M[0x2] = sph_dec64be_aligned(buf +  16); \
+		M[0x3] = sph_dec64be_aligned(buf +  24); \
+		M[0x4] = sph_dec64be_aligned(buf +  32); \
+		M[0x5] = sph_dec64be_aligned(buf +  40); \
+		M[0x6] = sph_dec64be_aligned(buf +  48); \
+		M[0x7] = sph_dec64be_aligned(buf +  56); \
+		M[0x8] = sph_dec64be_aligned(buf +  64); \
+		M[0x9] = sph_dec64be_aligned(buf +  72); \
+		M[0xA] = sph_dec64be_aligned(buf +  80); \
+		M[0xB] = sph_dec64be_aligned(buf +  88); \
+		M[0xC] = sph_dec64be_aligned(buf +  96); \
+		M[0xD] = sph_dec64be_aligned(buf + 104); \
+		M[0xE] = sph_dec64be_aligned(buf + 112); \
+		M[0xF] = sph_dec64be_aligned(buf + 120); \
+		for (r = 0; r < 16; r ++) /* fixed 16 rounds, one per sigma[] row */ \
+			ROUND_B(r); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V and the salt back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#else
+
+#define COMPRESS64   do { /* compress the 128 bytes at buf into H0..H7; uses buf and DECL_STATE64 locals */ \
+		sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; /* V0..V7: current chaining value */ \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CB0; /* V8..VB: salt xor constants */ \
+		V9 = S1 ^ CB1; \
+		VA = S2 ^ CB2; \
+		VB = S3 ^ CB3; \
+		VC = T0 ^ CB4; /* VC..VF: counter words xor constants */ \
+		VD = T0 ^ CB5; \
+		VE = T1 ^ CB6; \
+		VF = T1 ^ CB7; \
+		M0 = sph_dec64be_aligned(buf +   0); /* block read as 16 big-endian 64-bit words */ \
+		M1 = sph_dec64be_aligned(buf +   8); \
+		M2 = sph_dec64be_aligned(buf +  16); \
+		M3 = sph_dec64be_aligned(buf +  24); \
+		M4 = sph_dec64be_aligned(buf +  32); \
+		M5 = sph_dec64be_aligned(buf +  40); \
+		M6 = sph_dec64be_aligned(buf +  48); \
+		M7 = sph_dec64be_aligned(buf +  56); \
+		M8 = sph_dec64be_aligned(buf +  64); \
+		M9 = sph_dec64be_aligned(buf +  72); \
+		MA = sph_dec64be_aligned(buf +  80); \
+		MB = sph_dec64be_aligned(buf +  88); \
+		MC = sph_dec64be_aligned(buf +  96); \
+		MD = sph_dec64be_aligned(buf + 104); \
+		ME = sph_dec64be_aligned(buf + 112); \
+		MF = sph_dec64be_aligned(buf + 120); \
+		ROUND_B(0); /* 16 rounds in total */ \
+		ROUND_B(1); \
+		ROUND_B(2); \
+		ROUND_B(3); \
+		ROUND_B(4); \
+		ROUND_B(5); \
+		ROUND_B(6); \
+		ROUND_B(7); \
+		ROUND_B(8); \
+		ROUND_B(9); \
+		ROUND_B(0); /* rounds 10..15 reuse permutation rows 0..5 */ \
+		ROUND_B(1); \
+		ROUND_B(2); \
+		ROUND_B(3); \
+		ROUND_B(4); \
+		ROUND_B(5); \
+		H0 ^= S0 ^ V0 ^ V8; /* fold both halves of V and the salt back into H */ \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#endif
+
+static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };	/* all-zero salt passed by the 224/256 init functions */
+
+static void	/* reset sc for a new hash: load iv (8 words) and salt (4 words), clear counter and buffer fill */
+blake32_init(sph_blake_small_context *sc,
+	const sph_u32 *iv, const sph_u32 *salt)
+{
+	memcpy(sc->H, iv, 8 * sizeof(sph_u32));
+	memcpy(sc->S, salt, 4 * sizeof(sph_u32));
+	sc->T0 = sc->T1 = 0;	/* T1:T0 is the bit counter advanced by blake32() */
+	sc->ptr = 0;	/* number of bytes currently waiting in sc->buf */
+}
+
+static void	/* absorb len bytes of data; each time sc->buf fills, advance the counter and compress */
+blake32(sph_blake_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE32
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {	/* fast path: buf will not fill, so only buffer the data */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE32(sc);	/* work on local copies of H, S, T0, T1 */
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;	/* room left in buf */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			if ((T0 = SPH_T32(T0 + 512)) < 512)	/* add 512 bits; a result below 512 means T0 wrapped */
+				T1 = SPH_T32(T1 + 1);	/* carry into the high counter word */
+			COMPRESS32;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE32(sc);
+	sc->ptr = ptr;	/* leftover bytes stay in buf for the next call */
+}
+
+static void	/* append the top n bits of ub, pad, and write out_size_w32 big-endian 32-bit words to dst */
+blake32_close(sph_blake_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	union {
+		unsigned char buf[64];
+		sph_u32 dummy;
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;
+	unsigned z;
+	sph_u32 th, tl;
+	unsigned char *out;
+
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n;	/* message bits pending in the last, partial block */
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, then the first padding 1 bit */
+	tl = sc->T0 + bit_len;	/* th:tl is the length field written into the padding below */
+	th = sc->T1;
+	if (ptr == 0 && n == 0) {	/* pre-bias T1:T0 because blake32() adds 512 before compressing */
+		sc->T0 = SPH_C32(0xFFFFFE00);	/* no message bits in the block: counter wraps to 0 */
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+	} else if (sc->T0 == 0) {
+		sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;	/* borrow from T1 so the +512 carry restores it */
+		sc->T1 = SPH_T32(sc->T1 - 1);
+	} else {
+		sc->T0 -= 512 - bit_len;	/* after +512 the counter is old T0 + bit_len */
+	}
+	if (bit_len <= 446) {	/* the bit at byte 55 and the 8-byte length still fit in this block */
+		memset(u.buf + ptr + 1, 0, 55 - ptr);
+		if (out_size_w32 == 8)
+			u.buf[55] |= 1;	/* set only for the 8-word output (sph_blake256_*) */
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf + ptr, 64 - ptr);	/* bytes before ptr are already in sc->buf */
+	} else {	/* no room: finish this block, then feed a second, padding-only block */
+		memset(u.buf + ptr + 1, 0, 63 - ptr);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+		sc->T0 = SPH_C32(0xFFFFFE00);	/* second block holds no message bits: counter becomes 0 */
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+		memset(u.buf, 0, 56);
+		if (out_size_w32 == 8)
+			u.buf[55] = 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf, 64);
+	}
+	out = (unsigned char *)dst;
+	for (k = 0; k < out_size_w32; k ++)
+		sph_enc32be(out + (k << 2), sc->H[k]);	/* dst receives 4 * out_size_w32 bytes */
+}
+
+#if SPH_64
+
+static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };	/* all-zero salt passed by the 384/512 init functions */
+
+static void	/* reset sc for a new hash: load iv (8 words) and salt (4 words), clear counter and buffer fill */
+blake64_init(sph_blake_big_context *sc,
+	const sph_u64 *iv, const sph_u64 *salt)
+{
+	memcpy(sc->H, iv, 8 * sizeof(sph_u64));
+	memcpy(sc->S, salt, 4 * sizeof(sph_u64));
+	sc->T0 = sc->T1 = 0;	/* T1:T0 is the bit counter advanced by blake64() */
+	sc->ptr = 0;	/* number of bytes currently waiting in sc->buf */
+}
+
+static void	/* absorb len bytes of data; each time sc->buf fills, advance the counter and compress */
+blake64(sph_blake_big_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE64
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {	/* fast path: buf will not fill, so only buffer the data */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE64(sc);	/* work on local copies of H, S, T0, T1 */
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;	/* room left in buf */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			if ((T0 = SPH_T64(T0 + 1024)) < 1024)	/* add 1024 bits; a result below 1024 means T0 wrapped */
+				T1 = SPH_T64(T1 + 1);	/* carry into the high counter word */
+			COMPRESS64;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE64(sc);
+	sc->ptr = ptr;	/* leftover bytes stay in buf for the next call */
+}
+
+static void	/* append the top n bits of ub, pad, and write out_size_w64 big-endian 64-bit words to dst */
+blake64_close(sph_blake_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+{
+	union {
+		unsigned char buf[128];
+		sph_u64 dummy;
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;
+	unsigned z;
+	sph_u64 th, tl;
+	unsigned char *out;
+
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n;	/* message bits pending in the last, partial block */
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, then the first padding 1 bit */
+	tl = sc->T0 + bit_len;	/* th:tl is the length field written into the padding below */
+	th = sc->T1;
+	if (ptr == 0 && n == 0) {	/* pre-bias T1:T0 because blake64() adds 1024 before compressing */
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);	/* no message bits in the block: counter wraps to 0 */
+		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	} else if (sc->T0 == 0) {
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;	/* borrow from T1 so the +1024 carry restores it */
+		sc->T1 = SPH_T64(sc->T1 - 1);
+	} else {
+		sc->T0 -= 1024 - bit_len;	/* after +1024 the counter is old T0 + bit_len */
+	}
+	if (bit_len <= 894) {	/* the bit at byte 111 and the 16-byte length still fit in this block */
+		memset(u.buf + ptr + 1, 0, 111 - ptr);
+		if (out_size_w64 == 8)
+			u.buf[111] |= 1;	/* set only for the 8-word output (sph_blake512_*) */
+		sph_enc64be_aligned(u.buf + 112, th);
+		sph_enc64be_aligned(u.buf + 120, tl);
+		blake64(sc, u.buf + ptr, 128 - ptr);	/* bytes before ptr are already in sc->buf */
+	} else {	/* no room: finish this block, then feed a second, padding-only block */
+		memset(u.buf + ptr + 1, 0, 127 - ptr);
+		blake64(sc, u.buf + ptr, 128 - ptr);
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);	/* second block holds no message bits: counter becomes 0 */
+		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+		memset(u.buf, 0, 112);
+		if (out_size_w64 == 8)
+			u.buf[111] = 1;
+		sph_enc64be_aligned(u.buf + 112, th);
+		sph_enc64be_aligned(u.buf + 120, tl);
+		blake64(sc, u.buf, 128);
+	}
+	out = (unsigned char *)dst;
+	for (k = 0; k < out_size_w64; k ++)
+		sph_enc64be(out + (k << 3), sc->H[k]);	/* dst receives 8 * out_size_w64 bytes */
+}
+
+#endif
+
+/* see sph_blake.h -- start a BLAKE-224 computation: IV224 with the all-zero salt */
+void
+sph_blake224_init(void *cc)
+{
+	blake32_init(cc, IV224, salt_zero_small);
+}
+
+/* see sph_blake.h -- feed len bytes of data into the BLAKE-224 context cc */
+void
+sph_blake224(void *cc, const void *data, size_t len)
+{
+	blake32(cc, data, len);
+}
+
+/* see sph_blake.h -- finish with no extra bits; same as addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake224_close(void *cc, void *dst)
+{
+	sph_blake224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- add the top n bits of ub, write 7 32-bit words to dst, then re-initialise cc */
+void
+sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake32_close(cc, ub, n, dst, 7);
+	sph_blake224_init(cc);
+}
+
+/* see sph_blake.h -- start a BLAKE-256 computation: IV256 with the all-zero salt */
+void
+sph_blake256_init(void *cc)
+{
+	blake32_init(cc, IV256, salt_zero_small);
+}
+
+/* see sph_blake.h -- feed len bytes of data into the BLAKE-256 context cc */
+void
+sph_blake256(void *cc, const void *data, size_t len)
+{
+	blake32(cc, data, len);
+}
+
+/* see sph_blake.h -- finish with no extra bits; same as addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake256_close(void *cc, void *dst)
+{
+	sph_blake256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- add the top n bits of ub, write 8 32-bit words to dst, then re-initialise cc */
+void
+sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake32_close(cc, ub, n, dst, 8);
+	sph_blake256_init(cc);
+}
+
+/* see sph_blake.h -- select 14 rounds; any other value selects 8, as the unrolled COMPRESS32 does */
+void
+sph_blake256_set_rounds(int rounds)
+{
+	blake256_rounds = (rounds == 14) ? 14 : 8;	/* keeps the compact loop inside sigma[16][16] */
+}
+
+#if SPH_64
+
+/* see sph_blake.h -- start a BLAKE-384 computation: IV384 with the all-zero salt */
+void
+sph_blake384_init(void *cc)
+{
+	blake64_init(cc, IV384, salt_zero_big);
+}
+
+/* see sph_blake.h -- feed len bytes of data into the BLAKE-384 context cc */
+void
+sph_blake384(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h -- finish with no extra bits; same as addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake384_close(void *cc, void *dst)
+{
+	sph_blake384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- add the top n bits of ub, write 6 64-bit words to dst, then re-initialise cc */
+void
+sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 6);
+	sph_blake384_init(cc);
+}
+
+/* see sph_blake.h -- start a BLAKE-512 computation: IV512 with the all-zero salt */
+void
+sph_blake512_init(void *cc)
+{
+	blake64_init(cc, IV512, salt_zero_big);
+}
+
+/* see sph_blake.h -- feed len bytes of data into the BLAKE-512 context cc */
+void
+sph_blake512(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h -- finish with no extra bits; same as addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake512_close(void *cc, void *dst)
+{
+	sph_blake512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- add the top n bits of ub, write 8 64-bit words to dst, then re-initialise cc */
+void
+sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 8);
+	sph_blake512_init(cc);
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sph/sph_blake.h b/sph/sph_blake.h
new file mode 100644
index 0000000..2c2b3da
--- /dev/null
+++ b/sph/sph_blake.h
@@ -0,0 +1,337 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BLAKE_H__
+#define SPH_BLAKE_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for BLAKE-224.
+ */
+#define SPH_SIZE_blake224   224
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-384.
+ */
+#define SPH_SIZE_blake384   384
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+/**
+ * This structure is a context for BLAKE-224 and BLAKE-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* NOTE(review): presumably fill level of buf -- confirm */
+	sph_u32 H[8];             /* NOTE(review): presumably the chaining value -- confirm */
+	sph_u32 S[4];             /* NOTE(review): presumably the salt words -- confirm */
+	sph_u32 T0, T1;           /* NOTE(review): presumably the bit counter -- confirm */
+#endif
+} sph_blake_small_context;
+
+/**
+ * This structure is a context for BLAKE-224 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake224_context;
+
+/**
+ * This structure is a context for BLAKE-256 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BLAKE-384 and BLAKE-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* NOTE(review): presumably fill level of buf -- confirm */
+	sph_u64 H[8];              /* NOTE(review): presumably the chaining value -- confirm */
+	sph_u64 S[4];              /* NOTE(review): presumably the salt words -- confirm */
+	sph_u64 T0, T1;            /* NOTE(review): presumably the bit counter -- confirm */
+#endif
+} sph_blake_big_context;
+
+/**
+ * This structure is a context for BLAKE-384 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake384_context;
+
+/**
+ * This structure is a context for BLAKE-512 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake512_context;
+
+#endif
+
+/**
+ * Initialize a BLAKE-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-224 context (pointer to a
+ *             <code>sph_blake224_context</code>)
+ */
+void sph_blake224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param dst   the destination buffer
+ */
+void sph_blake224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Switch for the number of rounds (old blake was 8)
+ */
+extern int blake256_rounds;
+
+/**
+ * Initialize a BLAKE-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-256 context (pointer to a
+ *             <code>sph_blake256_context</code>)
+ */
+void sph_blake256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param dst   the destination buffer
+ */
+void sph_blake256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Set <code>blake256_rounds</code> (allows blakecoin and blake variants).
+ */
+void sph_blake256_set_rounds(int rounds);
+
+#if SPH_64
+
+/**
+ * Initialize a BLAKE-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-384 context (pointer to a
+ *             <code>sph_blake384_context</code>)
+ */
+void sph_blake384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param dst   the destination buffer
+ */
+void sph_blake384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-512 context (pointer to a
+ *             <code>sph_blake512_context</code>)
+ */
+void sph_blake512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param dst   the destination buffer
+ */
+void sph_blake512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sph/sph_types.h b/sph/sph_types.h
new file mode 100644
index 0000000..7295b0b
--- /dev/null
+++ b/sph/sph_types.h
@@ -0,0 +1,1976 @@
+/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */
+/**
+ * Basic type definitions.
+ *
+ * This header file defines the generic integer types that will be used
+ * for the implementation of hash functions; it also contains helper
+ * functions which encode and decode multi-byte integer values, using
+ * either little-endian or big-endian conventions.
+ *
+ * This file contains a compile-time test on the size of a byte
+ * (the <code>unsigned char</code> C type). If bytes are not octets,
+ * i.e. if they do not have a size of exactly 8 bits, then compilation
+ * is aborted. Architectures where bytes are not octets are relatively
+ * rare, even in the embedded devices market. We forbid non-octet bytes
+ * because there is no clear convention on how octet streams are encoded
+ * on such systems.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_types.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_TYPES_H__
+#define SPH_TYPES_H__
+
+#include <limits.h>
+
+/*
+ * All our I/O functions are defined over octet streams. We do not know
+ * how to handle input data if bytes are not octets.
+ */
+#if CHAR_BIT != 8
+#error This code requires 8-bit bytes
+#endif
+
+/* ============= BEGIN documentation block for Doxygen ============ */
+
+#ifdef DOXYGEN_IGNORE
+
+/** @mainpage sphlib C code documentation
+ *
+ * @section overview Overview
+ *
+ * <code>sphlib</code> is a library which contains implementations of
+ * various cryptographic hash functions. These pages have been generated
+ * with <a href="http://www.doxygen.org/index.html">doxygen</a> and
+ * document the API for the C implementations.
+ *
+ * The API is described in appropriate header files, which are available
+ * in the "Files" section. Each hash function family has its own header,
+ * whose name begins with <code>"sph_"</code> and contains the family
+ * name. For instance, the API for the RIPEMD hash functions is available
+ * in the header file <code>sph_ripemd.h</code>.
+ *
+ * @section principles API structure and conventions
+ *
+ * @subsection io Input/output conventions
+ *
+ * In all generality, hash functions operate over strings of bits.
+ * Individual bits are rarely encountered in C programming or actual
+ * communication protocols; most protocols converge on the ubiquitous
+ * "octet" which is a group of eight bits. Data is thus expressed as a
+ * stream of octets. The C programming language contains the notion of a
+ * "byte", which is a data unit managed under the type <code>"unsigned
+ * char"</code>. The C standard prescribes that a byte should hold at
+ * least eight bits, but possibly more. Most modern architectures, even
+ * in the embedded world, feature eight-bit bytes, i.e. map bytes to
+ * octets.
+ *
+ * Nevertheless, for some of the implemented hash functions, an extra
+ * API has been added, which allows the input of arbitrary sequences of
+ * bits: when the computation is about to be closed, 1 to 7 extra bits
+ * can be added. The functions for which this API is implemented include
+ * the SHA-2 functions and all SHA-3 candidates.
+ *
+ * <code>sphlib</code> defines hash functions which may hash octet streams,
+ * i.e. streams of bits where the number of bits is a multiple of eight.
+ * The data input functions in the <code>sphlib</code> API expect data
+ * as anonymous pointers (<code>"const void *"</code>) with a length
+ * (of type <code>"size_t"</code>) which gives the input data chunk length
+ * in bytes. A byte is assumed to be an octet; the <code>sph_types.h</code>
+ * header contains a compile-time test which prevents compilation on
+ * architectures where this property is not met.
+ *
+ * The hash function output is also converted into bytes. All currently
+ * implemented hash functions have an output width which is a multiple of
+ * eight, and this is likely to remain true for new designs.
+ *
+ * Most hash functions internally convert input data into 32-bit or 64-bit
+ * words, using either little-endian or big-endian conversion. The hash
+ * output also often consists of such words, which are encoded into output
+ * bytes with a similar endianness convention. Some hash functions have
+ * been only loosely specified on that subject; when necessary,
+ * <code>sphlib</code> has been tested against published "reference"
+ * implementations in order to use the same conventions.
+ *
+ * @subsection shortname Function short name
+ *
+ * Each implemented hash function has a "short name" which is used
+ * internally to derive the identifiers for the functions and context
+ * structures which the function uses. For instance, MD5 has the short
+ * name <code>"md5"</code>. Short names are listed in the next section,
+ * for the implemented hash functions. In subsequent sections, the
+ * short name will be assumed to be <code>"XXX"</code>: replace with the
+ * actual hash function name to get the C identifier.
+ *
+ * Note: some functions within the same family share the same core
+ * elements, such as update function or context structure. Correspondingly,
+ * some of the defined types or functions may actually be macros which
+ * transparently evaluate to another type or function name.
+ *
+ * @subsection context Context structure
+ *
+ * Each implemented hash function has its own context structure, available
+ * under the type name <code>"sph_XXX_context"</code> for the hash function
+ * with short name <code>"XXX"</code>. This structure holds all needed
+ * state for a running hash computation.
+ *
+ * The contents of these structures are meant to be opaque, and private
+ * to the implementation. However, these contents are specified in the
+ * header files so that application code which uses <code>sphlib</code>
+ * may access the size of those structures.
+ *
+ * The caller is responsible for allocating the context structure,
+ * whether by dynamic allocation (<code>malloc()</code> or equivalent),
+ * static allocation (a global permanent variable), as an automatic
+ * variable ("on the stack"), or by any other means which ensures proper
+ * structure alignment. <code>sphlib</code> code performs no dynamic
+ * allocation by itself.
+ *
+ * The context must be initialized before use, using the
+ * <code>sph_XXX_init()</code> function. This function sets the context
+ * state to proper initial values for hashing.
+ *
+ * Since all state data is contained within the context structure,
+ * <code>sphlib</code> is thread-safe and reentrant: several hash
+ * computations may be performed in parallel, provided that they do not
+ * operate on the same context. Moreover, a running computation can be
+ * cloned by copying the context (with a simple <code>memcpy()</code>):
+ * the context and its clone are then independent and may be updated
+ * with new data and/or closed without interfering with each other.
+ * Similarly, a context structure can be moved in memory at will:
+ * context structures contain no pointer, in particular no pointer to
+ * themselves.
+ *
+ * @subsection dataio Data input
+ *
+ * Hashed data is input with the <code>sph_XXX()</code> function, which
+ * takes as parameters a pointer to the context, a pointer to the data
+ * to hash, and the number of data bytes to hash. The context is updated
+ * with the new data.
+ *
+ * Data can be input in one or several calls, with arbitrary input lengths.
+ * However, it is best, performance wise, to input data by relatively big
+ * chunks (say a few kilobytes), because this allows <code>sphlib</code> to
+ * optimize things and avoid internal copying.
+ *
+ * When all data has been input, the context can be closed with
+ * <code>sph_XXX_close()</code>. The hash output is computed and written
+ * into the provided buffer. The caller must take care to provide a
+ * buffer of appropriate length; e.g., when using SHA-1, the output is
+ * a 20-byte word, therefore the output buffer must be at least 20-byte
+ * long.
+ *
+ * For some hash functions, the <code>sph_XXX_addbits_and_close()</code>
+ * function can be used instead of <code>sph_XXX_close()</code>. This
+ * function can take a few extra <strong>bits</strong> to be added at
+ * the end of the input message. This allows hashing messages with a
+ * bit length which is not a multiple of 8. The extra bits are provided
+ * as an unsigned integer value, and a bit count. The bit count must be
+ * between 0 and 7, inclusive. The extra bits are provided as bits 7 to
+ * 0 (bits of numerical value 128, 64, 32... down to 1), in that order.
+ * For instance, to add three bits of value 1, 1 and 0, the unsigned
+ * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count
+ * will be 3.
+ *
+ * The <code>SPH_SIZE_XXX</code> macro is defined for each hash function;
+ * it evaluates to the function output size, expressed in bits. For instance,
+ * <code>SPH_SIZE_sha1</code> evaluates to <code>160</code>.
+ *
+ * When closed, the context is automatically reinitialized and can be
+ * immediately used for another computation. It is not necessary to call
+ * <code>sph_XXX_init()</code> after a close. Note that
+ * <code>sph_XXX_init()</code> can still be called to "reset" a context,
+ * i.e. forget previously input data, and get back to the initial state.
+ *
+ * @subsection alignment Data alignment
+ *
+ * "Alignment" is a property of data, which is said to be "properly
+ * aligned" when its emplacement in memory is such that the data can
+ * be optimally read by full words. This depends on the type of access;
+ * basically, some hash functions will read data by 32-bit or 64-bit
+ * words. <code>sphlib</code> does not mandate such alignment for input
+ * data, but using aligned data can substantially improve performance.
+ *
+ * As a rule, it is best to input data by chunks whose length (in bytes)
+ * is a multiple of eight, and which begins at "generally aligned"
+ * addresses, such as the base address returned by a call to
+ * <code>malloc()</code>.
+ *
+ * @section functions Implemented functions
+ *
+ * We give here the list of implemented functions. They are grouped by
+ * family; to each family corresponds a specific header file. Each
+ * individual function has its associated "short name". Please refer to
+ * the documentation for that header file to get details on the hash
+ * function denomination and provenance.
+ *
+ * Note: the functions marked with a '(64)' in the list below are
+ * available only if the C compiler provides an integer type of length
+ * 64 bits or more. Such a type is mandatory in the latest C standard
+ * (ISO 9899:1999, aka "C99") and is present in several older compilers
+ * as well, so chances are that such a type is available.
+ *
+ * - HAVAL family: file <code>sph_haval.h</code>
+ *   - HAVAL-128/3 (128-bit, 3 passes): short name: <code>haval128_3</code>
+ *   - HAVAL-128/4 (128-bit, 4 passes): short name: <code>haval128_4</code>
+ *   - HAVAL-128/5 (128-bit, 5 passes): short name: <code>haval128_5</code>
+ *   - HAVAL-160/3 (160-bit, 3 passes): short name: <code>haval160_3</code>
+ *   - HAVAL-160/4 (160-bit, 4 passes): short name: <code>haval160_4</code>
+ *   - HAVAL-160/5 (160-bit, 5 passes): short name: <code>haval160_5</code>
+ *   - HAVAL-192/3 (192-bit, 3 passes): short name: <code>haval192_3</code>
+ *   - HAVAL-192/4 (192-bit, 4 passes): short name: <code>haval192_4</code>
+ *   - HAVAL-192/5 (192-bit, 5 passes): short name: <code>haval192_5</code>
+ *   - HAVAL-224/3 (224-bit, 3 passes): short name: <code>haval224_3</code>
+ *   - HAVAL-224/4 (224-bit, 4 passes): short name: <code>haval224_4</code>
+ *   - HAVAL-224/5 (224-bit, 5 passes): short name: <code>haval224_5</code>
+ *   - HAVAL-256/3 (256-bit, 3 passes): short name: <code>haval256_3</code>
+ *   - HAVAL-256/4 (256-bit, 4 passes): short name: <code>haval256_4</code>
+ *   - HAVAL-256/5 (256-bit, 5 passes): short name: <code>haval256_5</code>
+ * - MD2: file <code>sph_md2.h</code>, short name: <code>md2</code>
+ * - MD4: file <code>sph_md4.h</code>, short name: <code>md4</code>
+ * - MD5: file <code>sph_md5.h</code>, short name: <code>md5</code>
+ * - PANAMA: file <code>sph_panama.h</code>, short name: <code>panama</code>
+ * - RadioGatun family: file <code>sph_radiogatun.h</code>
+ *   - RadioGatun[32]: short name: <code>radiogatun32</code>
+ *   - RadioGatun[64]: short name: <code>radiogatun64</code> (64)
+ * - RIPEMD family: file <code>sph_ripemd.h</code>
+ *   - RIPEMD: short name: <code>ripemd</code>
+ *   - RIPEMD-128: short name: <code>ripemd128</code>
+ *   - RIPEMD-160: short name: <code>ripemd160</code>
+ * - SHA-0: file <code>sph_sha0.h</code>, short name: <code>sha0</code>
+ * - SHA-1: file <code>sph_sha1.h</code>, short name: <code>sha1</code>
+ * - SHA-2 family, 32-bit hashes: file <code>sph_sha2.h</code>
+ *   - SHA-224: short name: <code>sha224</code>
+ *   - SHA-256: short name: <code>sha256</code>
+ *   - SHA-384: short name: <code>sha384</code> (64)
+ *   - SHA-512: short name: <code>sha512</code> (64)
+ * - Tiger family: file <code>sph_tiger.h</code>
+ *   - Tiger: short name: <code>tiger</code> (64)
+ *   - Tiger2: short name: <code>tiger2</code> (64)
+ * - WHIRLPOOL family: file <code>sph_whirlpool.h</code>
+ *   - WHIRLPOOL-0: short name: <code>whirlpool0</code> (64)
+ *   - WHIRLPOOL-1: short name: <code>whirlpool1</code> (64)
+ *   - WHIRLPOOL: short name: <code>whirlpool</code> (64)
+ *
+ * The fourteen second-round SHA-3 candidates are also implemented;
+ * when applicable, the implementations follow the "final" specifications
+ * as published for the third round of the SHA-3 competition (BLAKE,
+ * Groestl, JH, Keccak and Skein have been tweaked for third round).
+ *
+ * - BLAKE family: file <code>sph_blake.h</code>
+ *   - BLAKE-224: short name: <code>blake224</code>
+ *   - BLAKE-256: short name: <code>blake256</code>
+ *   - BLAKE-384: short name: <code>blake384</code> (64)
+ *   - BLAKE-512: short name: <code>blake512</code> (64)
+ * - BMW (Blue Midnight Wish) family: file <code>sph_bmw.h</code>
+ *   - BMW-224: short name: <code>bmw224</code>
+ *   - BMW-256: short name: <code>bmw256</code>
+ *   - BMW-384: short name: <code>bmw384</code> (64)
+ *   - BMW-512: short name: <code>bmw512</code> (64)
+ * - CubeHash family: file <code>sph_cubehash.h</code> (specified as
+ *   CubeHash16/32 in the CubeHash specification)
+ *   - CubeHash-224: short name: <code>cubehash224</code>
+ *   - CubeHash-256: short name: <code>cubehash256</code>
+ *   - CubeHash-384: short name: <code>cubehash384</code>
+ *   - CubeHash-512: short name: <code>cubehash512</code>
+ * - ECHO family: file <code>sph_echo.h</code>
+ *   - ECHO-224: short name: <code>echo224</code>
+ *   - ECHO-256: short name: <code>echo256</code>
+ *   - ECHO-384: short name: <code>echo384</code>
+ *   - ECHO-512: short name: <code>echo512</code>
+ * - Fugue family: file <code>sph_fugue.h</code>
+ *   - Fugue-224: short name: <code>fugue224</code>
+ *   - Fugue-256: short name: <code>fugue256</code>
+ *   - Fugue-384: short name: <code>fugue384</code>
+ *   - Fugue-512: short name: <code>fugue512</code>
+ * - Groestl family: file <code>sph_groestl.h</code>
+ *   - Groestl-224: short name: <code>groestl224</code>
+ *   - Groestl-256: short name: <code>groestl256</code>
+ *   - Groestl-384: short name: <code>groestl384</code>
+ *   - Groestl-512: short name: <code>groestl512</code>
+ * - Hamsi family: file <code>sph_hamsi.h</code>
+ *   - Hamsi-224: short name: <code>hamsi224</code>
+ *   - Hamsi-256: short name: <code>hamsi256</code>
+ *   - Hamsi-384: short name: <code>hamsi384</code>
+ *   - Hamsi-512: short name: <code>hamsi512</code>
+ * - JH family: file <code>sph_jh.h</code>
+ *   - JH-224: short name: <code>jh224</code>
+ *   - JH-256: short name: <code>jh256</code>
+ *   - JH-384: short name: <code>jh384</code>
+ *   - JH-512: short name: <code>jh512</code>
+ * - Keccak family: file <code>sph_keccak.h</code>
+ *   - Keccak-224: short name: <code>keccak224</code>
+ *   - Keccak-256: short name: <code>keccak256</code>
+ *   - Keccak-384: short name: <code>keccak384</code>
+ *   - Keccak-512: short name: <code>keccak512</code>
+ * - Luffa family: file <code>sph_luffa.h</code>
+ *   - Luffa-224: short name: <code>luffa224</code>
+ *   - Luffa-256: short name: <code>luffa256</code>
+ *   - Luffa-384: short name: <code>luffa384</code>
+ *   - Luffa-512: short name: <code>luffa512</code>
+ * - Shabal family: file <code>sph_shabal.h</code>
+ *   - Shabal-192: short name: <code>shabal192</code>
+ *   - Shabal-224: short name: <code>shabal224</code>
+ *   - Shabal-256: short name: <code>shabal256</code>
+ *   - Shabal-384: short name: <code>shabal384</code>
+ *   - Shabal-512: short name: <code>shabal512</code>
+ * - SHAvite-3 family: file <code>sph_shavite.h</code>
+ *   - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
+ *     short name: <code>shavite224</code>
+ *   - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
+ *     short name: <code>shavite256</code>
+ *   - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
+ *     short name: <code>shavite384</code>
+ *   - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
+ *     short name: <code>shavite512</code>
+ * - SIMD family: file <code>sph_simd.h</code>
+ *   - SIMD-224: short name: <code>simd224</code>
+ *   - SIMD-256: short name: <code>simd256</code>
+ *   - SIMD-384: short name: <code>simd384</code>
+ *   - SIMD-512: short name: <code>simd512</code>
+ * - Skein family: file <code>sph_skein.h</code>
+ *   - Skein-224 (nominally specified as Skein-512-224): short name:
+ *     <code>skein224</code> (64)
+ *   - Skein-256 (nominally specified as Skein-512-256): short name:
+ *     <code>skein256</code> (64)
+ *   - Skein-384 (nominally specified as Skein-512-384): short name:
+ *     <code>skein384</code> (64)
+ *   - Skein-512 (nominally specified as Skein-512-512): short name:
+ *     <code>skein512</code> (64)
+ *
+ * For the second-round SHA-3 candidates, the functions are as specified
+ * for round 2, i.e. with the "tweaks" that some candidates added
+ * between round 1 and round 2. Also, some of the submitted packages for
+ * round 2 contained errors, in the specification, reference code, or
+ * both. <code>sphlib</code> implements the corrected versions.
+ */
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 32 bits; on most
+ * architectures, it will have a width of exactly 32 bits. Unsigned C
+ * types implement arithmetics modulo a power of 2; use the
+ * <code>SPH_T32()</code> macro to ensure that the value is truncated
+ * to exactly 32 bits. Unless otherwise specified, all macros and
+ * functions which accept <code>sph_u32</code> values assume that these
+ * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures
+ * where <code>sph_u32</code> is larger than that.
+ */
+typedef __arch_dependant__ sph_u32;
+
+/** @hideinitializer
+ * Signed integer type corresponding to <code>sph_u32</code>; it has
+ * width 32 bits or more.
+ */
+typedef __arch_dependant__ sph_s32;
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 64 bits; on most
+ * architectures which feature such a type, it will have a width of
+ * exactly 64 bits. C99-compliant platforms will have this type; it
+ * is also defined when the GNU compiler (gcc) is used, and on
+ * platforms where <code>unsigned long</code> is large enough. If this
+ * type is not available, then some hash functions which depend on
+ * a 64-bit type will not be available (most notably SHA-384, SHA-512,
+ * Tiger and WHIRLPOOL).
+ */
+typedef __arch_dependant__ sph_u64;
+
+/** @hideinitializer
+ * Signed integer type corresponding to <code>sph_u64</code>; it has
+ * width 64 bits or more.
+ */
+typedef __arch_dependant__ sph_s64;
+
+/**
+ * This macro expands the token <code>x</code> into a suitable
+ * constant expression of type <code>sph_u32</code>. Depending on
+ * how this type is defined, a suffix such as <code>UL</code> may
+ * be appended to the argument.
+ *
+ * @param x   the token to expand into a suitable constant expression
+ */
+#define SPH_C32(x)
+
+/**
+ * Truncate a 32-bit value to exactly 32 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler.
+ *
+ * @param x   the value to truncate (of type <code>sph_u32</code>)
+ */
+#define SPH_T32(x)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * <code>sph_u32</code> is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x   the value to rotate (of type <code>sph_u32</code>)
+ * @param n   the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTL32(x, n)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * <code>sph_u32</code> is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x   the value to rotate (of type <code>sph_u32</code>)
+ * @param n   the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTR32(x, n)
+
+/**
+ * This macro is defined on systems for which a 64-bit type has been
+ * detected, and is used for <code>sph_u64</code>.
+ */
+#define SPH_64
+
+/**
+ * This macro is defined on systems for which the "native" integer size is
+ * 64 bits (64-bit values fit in one register).
+ */
+#define SPH_64_TRUE
+
+/**
+ * This macro expands the token <code>x</code> into a suitable
+ * constant expression of type <code>sph_u64</code>. Depending on
+ * how this type is defined, a suffix such as <code>ULL</code> may
+ * be appended to the argument. This macro is defined only if a
+ * 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param x   the token to expand into a suitable constant expression
+ */
+#define SPH_C64(x)
+
+/**
+ * Truncate a 64-bit value to exactly 64 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler. This macro is defined only
+ * if a 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to truncate (of type <code>sph_u64</code>)
+ */
+#define SPH_T64(x)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * <code>sph_u64</code> is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to rotate (of type <code>sph_u64</code>)
+ * @param n   the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTL64(x, n)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * <code>sph_u64</code> is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to rotate (of type <code>sph_u64</code>)
+ * @param n   the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTR64(x, n)
+
+/**
+ * This macro evaluates to <code>inline</code> or an equivalent construction,
+ * if available on the compilation platform, or to nothing otherwise. This
+ * is used to declare inline functions, for which the compiler should
+ * endeavour to include the code directly in the caller. Inline functions
+ * are typically defined in header files as replacement for macros.
+ */
+#define SPH_INLINE
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * little-endian convention. This implies that the <code>sph_u32</code>
+ * type (and the <code>sph_u64</code> type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_LITTLE_ENDIAN
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * big-endian convention. This implies that the <code>sph_u32</code>
+ * type (and the <code>sph_u64</code> type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_BIG_ENDIAN
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in little-endian
+ * convention. This is the case for little-endian platforms, and also
+ * for the big-endian platforms which have special little-endian access
+ * opcodes (e.g. Ultrasparc).
+ */
+#define SPH_LITTLE_FAST
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in big-endian
+ * convention. This is the case for big-endian platforms, and also
+ * for the little-endian platforms which have special big-endian access
+ * opcodes.
+ */
+#define SPH_BIG_FAST
+
+/**
+ * On some platforms, this macro is defined to an unsigned integer type
+ * into which pointer values may be cast. The resulting value can then
+ * be tested for being a multiple of 2, 4 or 8, indicating an aligned
+ * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
+ */
+#define SPH_UPTR
+
+/**
+ * When defined, this macro indicates that unaligned memory accesses
+ * are possible with only a minor penalty, and thus should be preferred
+ * over strategies which first copy data to an aligned buffer.
+ */
+#define SPH_UNALIGNED
+
+/**
+ * Byte-swap a 32-bit word (i.e. <code>0x12345678</code> becomes
+ * <code>0x78563412</code>). This is an inline function which resorts
+ * to inline assembly on some platforms, for better performance.
+ *
+ * @param x   the 32-bit value to byte-swap
+ * @return  the byte-swapped value
+ */
+static inline sph_u32 sph_bswap32(sph_u32 x);
+
+/**
+ * Byte-swap a 64-bit word. This is an inline function which resorts
+ * to inline assembly on some platforms, for better performance. This
+ * function is defined only if a suitable 64-bit type was found for
+ * <code>sph_u64</code>
+ *
+ * @param x   the 64-bit value to byte-swap
+ * @return  the byte-swapped value
+ */
+static inline sph_u64 sph_bswap64(sph_u64 x);
+
+/**
+ * Decode a 16-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline unsigned sph_dec16le(const void *src);
+
+/**
+ * Encode a 16-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc16le(void *dst, unsigned val);
+
+/**
+ * Decode a 16-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline unsigned sph_dec16be(const void *src);
+
+/**
+ * Encode a 16-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc16be(void *dst, unsigned val);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32le(const void *src);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec32le()</code> function.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32le_aligned(const void *src);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32le(void *dst, sph_u32 val);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc32le()</code> function.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32le_aligned(void *dst, sph_u32 val);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32be(const void *src);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec32be()</code> function.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32be_aligned(const void *src);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32be(void *dst, sph_u32 val);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc32be()</code> function.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32be_aligned(void *dst, sph_u32 val);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64le(const void *src);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec64le()</code> function. This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64le_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64le(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc64le()</code> function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64le_aligned(void *dst, sph_u64 val);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64be(const void *src);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec64be()</code> function. This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64be_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64be(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc64be()</code> function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
+
+#endif
+
+/* ============== END documentation block for Doxygen ============= */
+
+#ifndef DOXYGEN_IGNORE
+
+/*
+ * We want to define the types "sph_u32" and "sph_u64" which hold
+ * unsigned values of at least, respectively, 32 and 64 bits. These
+ * tests should select appropriate types for most platforms. The
+ * macro "SPH_64" is defined if a 64-bit type is supported.
+ */
+
+#undef SPH_64
+#undef SPH_64_TRUE
+
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+
+/*
+ * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
+ * type, if any, or otherwise use a wider type (which must exist, for
+ * C99 conformance).
+ */
+
+#include <stdint.h>
+
+#ifdef UINT32_MAX
+typedef uint32_t sph_u32;
+typedef int32_t sph_s32;
+#else /* no exact-width 32-bit type: use the "fast" at-least-32-bit one */
+typedef uint_fast32_t sph_u32;
+typedef int_fast32_t sph_s32;
+#endif /* UINT32_MAX */
+#if !SPH_NO_64
+#ifdef UINT64_MAX
+typedef uint64_t sph_u64;
+typedef int64_t sph_s64;
+#else /* no exact-width 64-bit type: use the "fast" at-least-64-bit one */
+typedef uint_fast64_t sph_u64;
+typedef int_fast64_t sph_s64;
+#endif /* UINT64_MAX */
+#endif /* !SPH_NO_64 */
+
+#define SPH_C32(x)    ((sph_u32)(x))
+#if !SPH_NO_64
+#define SPH_C64(x)    ((sph_u64)(x))
+#define SPH_64  1
+#endif /* !SPH_NO_64 */
+
+#else /* not a C99 implementation */
+
+/*
+ * On non-C99 systems, we use "unsigned int" if it is wide enough,
+ * "unsigned long" otherwise. This supports all "reasonable" architectures.
+ * We have to be cautious: pre-C99 preprocessors handle constants
+ * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
+ */
+
+#if ((UINT_MAX >> 11) >> 11) >= 0x3FF /* i.e. UINT_MAX >= 2^32-1 */
+
+typedef unsigned int sph_u32;
+typedef int sph_s32;
+
+#define SPH_C32(x)    ((sph_u32)(x ## U))
+
+#else /* "unsigned int" is narrower than 32 bits */
+
+typedef unsigned long sph_u32;
+typedef long sph_s32;
+
+#define SPH_C32(x)    ((sph_u32)(x ## UL))
+
+#endif /* UINT_MAX width test */
+
+#if !SPH_NO_64
+
+/*
+ * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
+ * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
+ * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
+ * test whether "unsigned long long" is available; we also know that
+ * gcc features this type, even if the libc headers do not know it.
+ */
+
+#if ((ULONG_MAX >> 31) >> 31) >= 3 /* i.e. ULONG_MAX >= 2^64-1 */
+
+typedef unsigned long sph_u64;
+typedef long sph_s64;
+
+#define SPH_C64(x)    ((sph_u64)(x ## UL))
+
+#define SPH_64  1
+
+#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
+
+typedef unsigned long long sph_u64;
+typedef long long sph_s64;
+
+#define SPH_C64(x)    ((sph_u64)(x ## ULL))
+
+#define SPH_64  1
+
+#else
+
+/*
+ * No 64-bit type...
+ */
+
+#endif /* 64-bit type selection; SPH_64 stays undefined if none was found */
+
+#endif /* !SPH_NO_64 */
+
+#endif /* C99 vs. pre-C99 type selection */
+
+/*
+ * If the "unsigned long" type has length 64 bits or more, then this is
+ * a "true" 64-bit architecture. This is also true with Visual C on
+ * amd64, even though the "long" type is limited to 32 bits.
+ */
+#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64)
+#define SPH_64_TRUE   1
+#endif /* SPH_64_TRUE detection */
+
+/*
+ * Truncation and rotation macros. Implementation note: some processors
+ * have specific rotation opcodes; recent versions of gcc recognize the
+ * shift/OR expressions below and use them. x and n are evaluated twice.
+ */
+
+#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
+
+#if SPH_64 /* the 64-bit variants need sph_u64 and SPH_C64 */
+
+#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
+#define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))
+
+#endif /* SPH_64 */
+
+#ifndef DOXYGEN_IGNORE
+/* SPH_INLINE: the "inline" qualifier for small macro-like functions, if any. */
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+#define SPH_INLINE inline
+#elif defined __GNUC__
+/* "inline" is not a keyword under gcc -ansi/-std=c89; __inline__ always is. */
+#define SPH_INLINE __inline__
+#elif defined _MSC_VER
+#define SPH_INLINE __inline
+#else /* unknown pre-C99 compiler: no inline qualifier */
+#define SPH_INLINE
+#endif
+#endif /* DOXYGEN_IGNORE */
+
+/*
+ * We define some macros which qualify the architecture. These macros
+ * may be explicitly set externally (e.g. as compiler parameters). The
+ * code below sets those macros if they are not already defined.
+ *
+ * Most macros are boolean, thus evaluate to either zero or non-zero.
+ * The SPH_UPTR macro is special, in that it evaluates to a C type,
+ * or is not defined.
+ *
+ * SPH_UPTR             if defined: unsigned type to cast pointers into
+ *
+ * SPH_UNALIGNED        non-zero if unaligned accesses are efficient
+ * SPH_LITTLE_ENDIAN    non-zero if architecture is known to be little-endian
+ * SPH_BIG_ENDIAN       non-zero if architecture is known to be big-endian
+ * SPH_LITTLE_FAST      non-zero if little-endian decoding is fast
+ * SPH_BIG_FAST         non-zero if big-endian decoding is fast
+ *
+ * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
+ * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
+ * _must_ be non-zero in those situations. The 32-bit and 64-bit types
+ * _must_ also have an exact width.
+ *
+ * SPH_SPARCV9_GCC_32   UltraSPARC-compatible with gcc, 32-bit mode
+ * SPH_SPARCV9_GCC_64   UltraSPARC-compatible with gcc, 64-bit mode
+ * SPH_SPARCV9_GCC      UltraSPARC-compatible with gcc
+ * SPH_I386_GCC         x86-compatible (32-bit) with gcc
+ * SPH_I386_MSVC        x86-compatible (32-bit) with Microsoft Visual C
+ * SPH_AMD64_GCC        x86-compatible (64-bit) with gcc
+ * SPH_AMD64_MSVC       x86-compatible (64-bit) with Microsoft Visual C
+ * SPH_PPC32_GCC        PowerPC, 32-bit, with gcc
+ * SPH_PPC64_GCC        PowerPC, 64-bit, with gcc
+ *
+ * TODO: enhance automatic detection, for more architectures and compilers.
+ * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
+ * some very fast functions (e.g. MD4) when using unaligned input data.
+ * The CPU-specific-with-GCC macros are useful only for inline assembly,
+ * normally restrained to this header file.
+ */
+
+/*
+ * 32-bit x86, aka "i386 compatible".
+ */
+#if defined __i386__ || defined _M_IX86
+
+#define SPH_DETECT_UNALIGNED         1
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#define SPH_DETECT_UPTR              sph_u32
+#ifdef __GNUC__
+#define SPH_DETECT_I386_GCC          1
+#endif /* __GNUC__ */
+#ifdef _MSC_VER
+#define SPH_DETECT_I386_MSVC         1
+#endif /* _MSC_VER */
+
+/*
+ * 64-bit x86, hereafter known as "amd64".
+ */
+#elif defined __x86_64 || defined _M_X64
+
+#define SPH_DETECT_UNALIGNED         1
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#define SPH_DETECT_UPTR              sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_AMD64_GCC         1
+#endif /* __GNUC__ */
+#ifdef _MSC_VER
+#define SPH_DETECT_AMD64_MSVC        1
+#endif /* _MSC_VER */
+
+/*
+ * 64-bit Sparc architecture (implies v9).
+ */
+#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \
+	|| defined __sparcv9
+
+#define SPH_DETECT_BIG_ENDIAN        1
+#define SPH_DETECT_UPTR              sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_SPARCV9_GCC_64    1
+#define SPH_DETECT_LITTLE_FAST       1
+#endif /* __GNUC__ */
+
+/*
+ * 32-bit Sparc.
+ */
+#elif (defined __sparc__ || defined __sparc) \
+	&& !(defined __sparcv9 || defined __arch64__)
+
+#define SPH_DETECT_BIG_ENDIAN        1
+#define SPH_DETECT_UPTR              sph_u32
+#if defined __GNUC__ && defined __sparc_v9__
+#define SPH_DETECT_SPARCV9_GCC_32    1
+#define SPH_DETECT_LITTLE_FAST       1
+#endif /* __GNUC__ && __sparc_v9__ */
+
+/*
+ * ARM, little-endian (only the byte order is recorded).
+ */
+#elif defined __arm__ && __ARMEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN     1
+
+/*
+ * MIPS, little-endian (only the byte order is recorded).
+ */
+#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN     1
+
+/*
+ * MIPS, big-endian (only the byte order is recorded).
+ */
+#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__
+
+#define SPH_DETECT_BIG_ENDIAN        1
+
+/*
+ * PowerPC.
+ */
+#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \
+	|| defined _ARCH_PPC
+
+/*
+ * Note: we do not declare cross-endian access to be "fast": even if
+ * using inline assembly, implementation should still assume that
+ * keeping the decoded word in a temporary is faster than decoding
+ * it again.
+ */
+#if defined __GNUC__
+#if SPH_64_TRUE
+#define SPH_DETECT_PPC64_GCC         1
+#else
+#define SPH_DETECT_PPC32_GCC         1
+#endif /* SPH_64_TRUE */
+#endif /* __GNUC__ */
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN        1
+#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#endif /* PowerPC byte order; left undetected if neither macro is set */
+
+/*
+ * Itanium, 64-bit.
+ */
+#elif defined __ia64 || defined __ia64__ \
+	|| defined __itanium__ || defined _M_IA64
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN        1
+#else
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#endif /* Itanium byte order; little-endian is the default */
+#if defined __LP64__ || defined _LP64
+#define SPH_DETECT_UPTR              sph_u64
+#else
+#define SPH_DETECT_UPTR              sph_u32
+#endif /* __LP64__ || _LP64 */
+
+#endif /* architecture-specific detection */
+
+#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64
+#define SPH_DETECT_SPARCV9_GCC       1
+#endif
+/* Copy each SPH_DETECT_* result to SPH_*, unless SPH_* is already defined. */
+#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED
+#define SPH_UNALIGNED         SPH_DETECT_UNALIGNED
+#endif
+#if defined SPH_DETECT_UPTR && !defined SPH_UPTR
+#define SPH_UPTR              SPH_DETECT_UPTR
+#endif
+#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN
+#define SPH_LITTLE_ENDIAN     SPH_DETECT_LITTLE_ENDIAN
+#endif
+#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN
+#define SPH_BIG_ENDIAN        SPH_DETECT_BIG_ENDIAN
+#endif
+#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST
+#define SPH_LITTLE_FAST       SPH_DETECT_LITTLE_FAST
+#endif
+#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST
+#define SPH_BIG_FAST    SPH_DETECT_BIG_FAST
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32
+#define SPH_SPARCV9_GCC_32    SPH_DETECT_SPARCV9_GCC_32
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64
+#define SPH_SPARCV9_GCC_64    SPH_DETECT_SPARCV9_GCC_64
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC
+#define SPH_SPARCV9_GCC       SPH_DETECT_SPARCV9_GCC
+#endif
+#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC
+#define SPH_I386_GCC          SPH_DETECT_I386_GCC
+#endif
+#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC
+#define SPH_I386_MSVC         SPH_DETECT_I386_MSVC
+#endif
+#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC
+#define SPH_AMD64_GCC         SPH_DETECT_AMD64_GCC
+#endif
+#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC
+#define SPH_AMD64_MSVC        SPH_DETECT_AMD64_MSVC
+#endif
+#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC
+#define SPH_PPC32_GCC         SPH_DETECT_PPC32_GCC
+#endif
+#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC
+#define SPH_PPC64_GCC         SPH_DETECT_PPC64_GCC
+#endif
+/* Accesses in the platform's own byte order default to "fast". */
+#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST
+#define SPH_LITTLE_FAST              1
+#endif
+#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST
+#define SPH_BIG_FAST                 1
+#endif
+/* The SPH_UPTR word-access paths require a known byte order. */
+#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN)
+#error SPH_UPTR defined, but endianness is not known.
+#endif
+
+#if SPH_I386_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
+ * values; 64-bit values are swapped as two 32-bit halves.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
+	return x;
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
+		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
+}
+
+#endif /* SPH_64 */
+
+#elif SPH_AMD64_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 64-bit, with gcc, we use the bswapl and bswapq opcodes to
+ * byte-swap 32-bit and 64-bit values, respectively.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
+	return x;
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	__asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x));
+	return x;
+}
+
+#endif /* SPH_64 */
+
+/*
+ * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough
+ * to generate proper opcodes for endianness swapping with the pure C
+ * implementation below.
+ *
+
+#elif SPH_I386_MSVC && !SPH_NO_ASM
+
+static __inline sph_u32 __declspec(naked) __fastcall
+sph_bswap32(sph_u32 x)
+{
+	__asm {
+		bswap  ecx
+		mov    eax,ecx
+		ret
+	}
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
+		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
+}
+
+#endif
+
+ *
+ * [end of disabled code]
+ */
+
+#else
+
+/* Portable byte swap: exchange the 16-bit halves, then bytes in each half. */
+static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x)
+{
+	sph_u32 w = SPH_T32((x >> 16) | (x << 16));
+	sph_u32 odd = (w >> 8) & SPH_C32(0x00FF00FF);
+	sph_u32 even = (w << 8) & SPH_C32(0xFF00FF00);
+	return odd | even;
+}
+
+#if SPH_64
+
+/**
+ * Reverse the byte order of a 64-bit value in plain C: swap the 32-bit
+ * halves, then the 16-bit pairs, then adjacent bytes.
+ * @param x   the input value
+ * @return  the byte-swapped value
+ */
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	sph_u64 w = SPH_T64((x >> 32) | (x << 32));
+	w = ((w << 16) & SPH_C64(0xFFFF0000FFFF0000))
+		| ((w >> 16) & SPH_C64(0x0000FFFF0000FFFF));
+	w = ((w << 8) & SPH_C64(0xFF00FF00FF00FF00))
+		| ((w >> 8) & SPH_C64(0x00FF00FF00FF00FF));
+	return w;
+}
+
+#endif
+
+#endif
+
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+
+/*
+ * On UltraSPARC systems, native ordering is big-endian, but little-endian
+ * read accesses are possible by specifying the address space 0x88
+ * (ASI_PRIMARY_LITTLE): either "lda [%reg]0x88,%dst", or
+ * "lda [%reg+imm]%asi,%dst", which takes the address space from %asi.
+ * The latter combines an addition and the access in a single opcode, but
+ * setting (and then resetting) %asi is slow, so operations with many
+ * successive little-endian reads (e.g. the MD5 compression function)
+ * should share one %asi setting. The macros below provide this:
+ *  - SPH_SPARCV9_SET_ASI declares the local sph_sparcv9_asi, saves %asi
+ *    into it and sets %asi to 0x88;
+ *  - SPH_SPARCV9_RESET_ASI writes the saved value back to %asi;
+ *  - SPH_SPARCV9_DEC32LE(base, idx) loads the 32-bit word at base+idx*4
+ *    through %asi; idx is pasted into the asm text as written.
+ */
+
+#define SPH_SPARCV9_SET_ASI   \
+	sph_u32 sph_sparcv9_asi; \
+	__asm__ __volatile__ ( \
+		"rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi));
+
+#define SPH_SPARCV9_RESET_ASI  \
+	__asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi));
+
+#define SPH_SPARCV9_DEC32LE(base, idx)   ({ \
+		sph_u32 sph_sparcv9_tmp; \
+		__asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \
+			: "=r" (sph_sparcv9_tmp) : "r" (base)); \
+		sph_sparcv9_tmp; \
+	})
+
+#endif /* SPH_SPARCV9_GCC && !SPH_NO_ASM */
+
+/* 16-bit big-endian codecs: the most significant byte comes first. */
+static SPH_INLINE void sph_enc16be(void *dst, unsigned val)
+{
+	unsigned char *out = (unsigned char *)dst;
+	out[0] = (val >> 8);
+	out[1] = val;
+}
+
+static SPH_INLINE unsigned sph_dec16be(const void *src)
+{
+	const unsigned char *in = (const unsigned char *)src;
+	return ((unsigned)in[0] << 8) | (unsigned)in[1];
+}
+
+/* 16-bit little-endian codecs: the least significant byte comes first. */
+static SPH_INLINE void sph_enc16le(void *dst, unsigned val)
+{
+	unsigned char *out = (unsigned char *)dst;
+	out[0] = val;
+	out[1] = val >> 8;
+}
+
+static SPH_INLINE unsigned sph_dec16le(const void *src)
+{
+	const unsigned char *in = (const unsigned char *)src;
+	return (unsigned)in[0] | ((unsigned)in[1] << 8);
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (big endian convention).
+ * Uses one word store when SPH_UPTR permits it, else four byte stores.
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32be(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	val = sph_bswap32(val);
+#endif
+	*(sph_u32 *)dst = val;
+#else /* !SPH_UNALIGNED: word store only if dst is a multiple of 4 */
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_LITTLE_ENDIAN
+		val = sph_bswap32(val);
+#endif
+		*(sph_u32 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = (val >> 24);
+		((unsigned char *)dst)[1] = (val >> 16);
+		((unsigned char *)dst)[2] = (val >> 8);
+		((unsigned char *)dst)[3] = val;
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR not defined: byte-by-byte store */
+	((unsigned char *)dst)[0] = (val >> 24);
+	((unsigned char *)dst)[1] = (val >> 16);
+	((unsigned char *)dst)[2] = (val >> 8);
+	((unsigned char *)dst)[3] = val;
+#endif /* SPH_UPTR */
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (big endian convention).
+ * The destination buffer must be properly aligned.
+ * One word store is used whenever the platform byte order is known.
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc32be_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = val;
+#else /* byte order unknown: store byte by byte */
+	((unsigned char *)dst)[0] = (val >> 24);
+	((unsigned char *)dst)[1] = (val >> 16);
+	((unsigned char *)dst)[2] = (val >> 8);
+	((unsigned char *)dst)[3] = val;
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (big endian convention).
+ * Uses one word load when SPH_UPTR permits it, else assembles four bytes.
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32be(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else
+	return *(const sph_u32 *)src;
+#endif
+#else /* !SPH_UNALIGNED: word load only if src is a multiple of 4 */
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_LITTLE_ENDIAN
+		return sph_bswap32(*(const sph_u32 *)src);
+#else
+		return *(const sph_u32 *)src;
+#endif
+	} else {
+		return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+			| (sph_u32)(((const unsigned char *)src)[3]);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* SPH_UPTR not defined: byte-by-byte load */
+	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+		| (sph_u32)(((const unsigned char *)src)[3]);
+#endif /* SPH_UPTR */
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (big endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (32-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32be_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#elif SPH_BIG_ENDIAN
+	return *(const sph_u32 *)src;
+#else
+	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+		| (sph_u32)(((const unsigned char *)src)[3]);
+#endif
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (little endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32le(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap32(val);
+#endif
+	*(sph_u32 *)dst = val;
+#else
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap32(val);
+#endif
+		*(sph_u32 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = val;
+		((unsigned char *)dst)[1] = (val >> 8);
+		((unsigned char *)dst)[2] = (val >> 16);
+		((unsigned char *)dst)[3] = (val >> 24);
+	}
+#endif
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (little endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc32le_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = val;
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else
+	return *(const sph_u32 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		/*
+		 * "__volatile__" is needed here because without it,
+		 * gcc-3.4.3 miscompiles the code and performs the
+		 * access before the test on the address, thus triggering
+		 * a bus error...
+		 */
+		__asm__ __volatile__ (
+			"lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * On PowerPC, this turns out not to be worth the effort: the inline
+ * assembly makes GCC optimizer uncomfortable, which tends to nullify
+ * the decoding gains.
+ *
+ * For most hash functions, using this inline assembly trick changes
+ * hashing speed by less than 5% and often _reduces_ it. The biggest
+ * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
+ * less than 10%. The speed gain on CubeHash is probably due to the
+ * chronic shortage of registers that CubeHash endures; for the other
+ * functions, the generic code appears to be efficient enough already.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		__asm__ __volatile__ (
+			"lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else
+		return sph_bswap32(*(const sph_u32 *)src);
+#endif
+#else
+		return *(const sph_u32 *)src;
+#endif
+	} else {
+		return (sph_u32)(((const unsigned char *)src)[0])
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+	}
+#endif
+#else
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (32-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return *(const sph_u32 *)src;
+#elif SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+	sph_u32 tmp;
+
+	__asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+	return tmp;
+/*
+ * Not worth it generally.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+	sph_u32 tmp;
+
+	__asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else
+	return sph_bswap32(*(const sph_u32 *)src);
+#endif
+#else
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif
+}
+
+#if SPH_64
+
+/**
+ * Encode a 64-bit value into the provided buffer (big endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 64-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc64be(void *dst, sph_u64 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	val = sph_bswap64(val);
+#endif
+	*(sph_u64 *)dst = val;
+#else
+	if (((SPH_UPTR)dst & 7) == 0) {
+#if SPH_LITTLE_ENDIAN
+		val = sph_bswap64(val);
+#endif
+		*(sph_u64 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = (val >> 56);
+		((unsigned char *)dst)[1] = (val >> 48);
+		((unsigned char *)dst)[2] = (val >> 40);
+		((unsigned char *)dst)[3] = (val >> 32);
+		((unsigned char *)dst)[4] = (val >> 24);
+		((unsigned char *)dst)[5] = (val >> 16);
+		((unsigned char *)dst)[6] = (val >> 8);
+		((unsigned char *)dst)[7] = val;
+	}
+#endif
+#else
+	((unsigned char *)dst)[0] = (val >> 56);
+	((unsigned char *)dst)[1] = (val >> 48);
+	((unsigned char *)dst)[2] = (val >> 40);
+	((unsigned char *)dst)[3] = (val >> 32);
+	((unsigned char *)dst)[4] = (val >> 24);
+	((unsigned char *)dst)[5] = (val >> 16);
+	((unsigned char *)dst)[6] = (val >> 8);
+	((unsigned char *)dst)[7] = val;
+#endif
+}
+
+/**
+ * Encode a 64-bit value into the provided buffer (big endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (64-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc64be_aligned(void *dst, sph_u64 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u64 *)dst = sph_bswap64(val);
+#elif SPH_BIG_ENDIAN
+	*(sph_u64 *)dst = val;
+#else
+	((unsigned char *)dst)[0] = (val >> 56);
+	((unsigned char *)dst)[1] = (val >> 48);
+	((unsigned char *)dst)[2] = (val >> 40);
+	((unsigned char *)dst)[3] = (val >> 32);
+	((unsigned char *)dst)[4] = (val >> 24);
+	((unsigned char *)dst)[5] = (val >> 16);
+	((unsigned char *)dst)[6] = (val >> 8);
+	((unsigned char *)dst)[7] = val;
+#endif
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (big endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64be(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#else
+	return *(const sph_u64 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 7) == 0) {
+#if SPH_LITTLE_ENDIAN
+		return sph_bswap64(*(const sph_u64 *)src);
+#else
+		return *(const sph_u64 *)src;
+#endif
+	} else {
+		return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
+			| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
+			| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
+			| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
+			| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
+			| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
+			| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
+			| (sph_u64)(((const unsigned char *)src)[7]);
+	}
+#endif
+#else
+	return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
+		| (sph_u64)(((const unsigned char *)src)[7]);
+#endif
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (big endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (64-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64be_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#elif SPH_BIG_ENDIAN
+	return *(const sph_u64 *)src;
+#else
+	return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
+		| (sph_u64)(((const unsigned char *)src)[7]);
+#endif
+}
+
+/**
+ * Encode a 64-bit value into the provided buffer (little endian convention).
+ *
+ * @param dst   the destination buffer
+ * @param val   the 64-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc64le(void *dst, sph_u64 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap64(val);
+#endif
+	*(sph_u64 *)dst = val;
+#else
+	if (((SPH_UPTR)dst & 7) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap64(val);
+#endif
+		*(sph_u64 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = val;
+		((unsigned char *)dst)[1] = (val >> 8);
+		((unsigned char *)dst)[2] = (val >> 16);
+		((unsigned char *)dst)[3] = (val >> 24);
+		((unsigned char *)dst)[4] = (val >> 32);
+		((unsigned char *)dst)[5] = (val >> 40);
+		((unsigned char *)dst)[6] = (val >> 48);
+		((unsigned char *)dst)[7] = (val >> 56);
+	}
+#endif
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+	((unsigned char *)dst)[4] = (val >> 32);
+	((unsigned char *)dst)[5] = (val >> 40);
+	((unsigned char *)dst)[6] = (val >> 48);
+	((unsigned char *)dst)[7] = (val >> 56);
+#endif
+}
+
+/**
+ * Encode a 64-bit value into the provided buffer (little endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (64-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc64le_aligned(void *dst, sph_u64 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u64 *)dst = val;
+#elif SPH_BIG_ENDIAN
+	*(sph_u64 *)dst = sph_bswap64(val);
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+	((unsigned char *)dst)[4] = (val >> 32);
+	((unsigned char *)dst)[5] = (val >> 40);
+	((unsigned char *)dst)[6] = (val >> 48);
+	((unsigned char *)dst)[7] = (val >> 56);
+#endif
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (little endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#else
+	return *(const sph_u64 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 7) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
+		sph_u64 tmp;
+
+		__asm__ __volatile__ (
+			"ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * Not worth it generally.
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+		return (sph_u64)sph_dec32le_aligned(src)
+			| ((sph_u64)sph_dec32le_aligned(
+				(const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+		sph_u64 tmp;
+
+		__asm__ __volatile__ (
+			"ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else
+		return sph_bswap64(*(const sph_u64 *)src);
+#endif
+#else
+		return *(const sph_u64 *)src;
+#endif
+	} else {
+		return (sph_u64)(((const unsigned char *)src)[0])
+			| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+			| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+			| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+			| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+			| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+	}
+#endif
+#else
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned.
+ *
+ * @param src   the source buffer (64-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64le_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return *(const sph_u64 *)src;
+#elif SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+	return tmp;
+/*
+ * Not worth it generally.
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+	return (sph_u64)sph_dec32le_aligned(src)
+		| ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else
+	return sph_bswap64(*(const sph_u64 *)src);
+#endif
+#else
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif
+}
+
+#endif
+
+#endif /* Doxygen excluded block */
+
+#endif

From 88d8e7ec2433182e2fe36cec95ae3be3881b2a60 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Mon, 12 Sep 2016 16:36:55 -0400
Subject: [PATCH 048/150] Bump for v0.4.1

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index 7ee119d..e5a9c8f 100644
--- a/version.go
+++ b/version.go
@@ -32,7 +32,7 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 const (
 	appMajor uint = 0
 	appMinor uint = 4
-	appPatch uint = 0
+	appPatch uint = 1
 
 	// appPreRelease MUST only contain characters from semanticAlphabet
 	// per the semantic versioning spec.

From 8a315cd482c8f0e2e2ffcbcfeb6c29e2c0235a29 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 13 Sep 2016 14:16:54 -0400
Subject: [PATCH 049/150] Small optimization for CUDA.

Suggested by cj.

Was getting an average hashrate of 1.455GH/s
before.  Now getting 1.512GH/s

Tested on Arch Linux with a GeForce GTX 970.
---
 decred.cu | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/decred.cu b/decred.cu
index c5cd420..e37af60 100644
--- a/decred.cu
+++ b/decred.cu
@@ -170,17 +170,9 @@ __global__ void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t sta
 		pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS(    2, 7, 8, 13);
 
 		if ((c_h[1]^v[15]) == v[7]) {
-			v[ 3] += c_xors[i++] + v[4];
-			v[14] = ROL16(v[14] ^ v[3]);
-			v[ 9] += v[14];
-			v[ 4] = ROTR32(v[4] ^ v[9], 12);
-			v[ 3] += c_xors[i++] + v[4];
-			v[14] = ROR8(v[14] ^ v[3]);
-			if(cuda_swab32((c_h[0]^v[6]^v[14])) <= highTarget) {
-				uint32_t pos = atomicInc(&resNonce[0], UINT32_MAX)+1;
-				resNonce[pos] = nonce;
-				return;
-			}
+		        uint32_t pos = atomicInc(&resNonce[0], UINT32_MAX)+1;
+			resNonce[pos] = nonce;
+			return;
 		}
 	}
 }

From 5ca393cf2d39de56ad49fea61a9c80d607ea8678 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Tue, 13 Sep 2016 13:44:07 -0500
Subject: [PATCH 050/150] adjust various headers so windows builds (#89)

---
 miner.h | 46 ++--------------------------------------------
 1 file changed, 2 insertions(+), 44 deletions(-)

diff --git a/miner.h b/miner.h
index b68e831..f2c75c3 100644
--- a/miner.h
+++ b/miner.h
@@ -7,11 +7,10 @@ extern "C" {
 
 //#include <ccminer-config.h>
 
+#include <stdio.h>
+#include <stdlib.h>
 #include <stdbool.h>
 #include <inttypes.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include <curl/curl.h>
 
 #ifdef _MSC_VER
 #undef HAVE_ALLOCA_H
@@ -296,12 +295,6 @@ struct cgpu_info {
 	uint32_t throughput;
 };
 
-struct thr_api {
-	int id;
-	pthread_t pth;
-	struct thread_q	*q;
-};
-
 struct stats_data {
 	uint32_t uid;
 	uint32_t tm_stat;
@@ -339,12 +332,6 @@ struct hashlog_data {
 
 /* end of api */
 
-struct thr_info {
-	int		id;
-	pthread_t	pth;
-	struct thread_q	*q;
-	struct cgpu_info gpu;
-};
 
 struct work_restart {
 	/* volatile to modify accross threads (vstudio thing) */
@@ -385,7 +372,6 @@ extern long opt_proxy_type;
 extern bool use_syslog;
 extern bool use_colors;
 extern int use_pok;
-extern pthread_mutex_t applog_lock;
 extern struct thr_info *thr_info;
 extern int longpoll_thr_id;
 extern int stratum_thr_id;
@@ -505,33 +491,6 @@ struct stratum_job {
 	double diff;
 };
 
-struct stratum_ctx {
-	char *url;
-
-	CURL *curl;
-	char *curl_url;
-	char curl_err_str[CURL_ERROR_SIZE];
-	curl_socket_t sock;
-	size_t sockbuf_size;
-	char *sockbuf;
-
-	double next_diff;
-	double sharediff;
-
-	char *session_id;
-	size_t xnonce1_size;
-	unsigned char *xnonce1;
-	size_t xnonce2_size;
-	struct stratum_job job;
-
-	struct timeval tv_submit;
-	uint32_t answer_msec;
-	int pooln;
-	time_t tm_connected;
-
-	int srvtime_diff;
-};
-
 #define POK_MAX_TXS   4
 #define POK_MAX_TX_SZ 16384U
 struct tx {
@@ -600,7 +559,6 @@ struct pool_infos {
 	int time_limit;
 	int scantime;
 	// connection
-	struct stratum_ctx stratum;
 	uint8_t allow_gbt;
 	uint8_t allow_mininginfo;
 	uint16_t check_dups; // 16_t for align

From f823870d3e9e84c58f78b19e787b4ab8d1181edd Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Tue, 13 Sep 2016 14:45:46 -0500
Subject: [PATCH 051/150] add result field so errors are unmarshaled properly
 (#90)

---
 stratum/stratum.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/stratum/stratum.go b/stratum/stratum.go
index bc9de91..78db377 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -114,6 +114,7 @@ type StratumRsp struct {
 type StratErr struct {
 	ErrNum uint64
 	ErrStr string
+	Result *json.RawMessage `json:"result,omitempty"`
 }
 
 // Basic reply is a reply type for any of the simple messages.

From 2af2dc96b19b72bb111d796d4027d550ff617747 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 13 Sep 2016 16:26:18 -0400
Subject: [PATCH 052/150] gofmt (#91)

---
 cl/context.go   | 1 -
 cl/device.go    | 3 +--
 cl/image.go     | 3 +--
 cl/image11.go   | 3 +--
 cl/kernel.go    | 1 -
 cl/memory.go    | 1 -
 cl/platform.go  | 1 -
 cl/program.go   | 1 -
 cl/program11.go | 1 -
 cl/queue.go     | 1 -
 cl/sampler.go   | 1 -
 11 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/cl/context.go b/cl/context.go
index e817f0e..affcbbc 100644
--- a/cl/context.go
+++ b/cl/context.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/device.go b/cl/device.go
index bec109e..3558406 100644
--- a/cl/device.go
+++ b/cl/device.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
@@ -14,7 +13,7 @@ package cl
 #else
 #include "CL/opencl.h"
 #endif
- */
+*/
 import "C"
 import "unsafe"
 
diff --git a/cl/image.go b/cl/image.go
index c4dd353..73565ae 100644
--- a/cl/image.go
+++ b/cl/image.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
@@ -14,7 +13,7 @@ package cl
 #else
 #include "CL/opencl.h"
 #endif
- */
+*/
 import "C"
 
 import (
diff --git a/cl/image11.go b/cl/image11.go
index b9a68fe..8133ae9 100644
--- a/cl/image11.go
+++ b/cl/image11.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
@@ -14,7 +13,7 @@ package cl
 #else
 #include "CL/opencl.h"
 #endif
- */
+*/
 import "C"
 import "unsafe"
 
diff --git a/cl/kernel.go b/cl/kernel.go
index 44d03f7..73b4ec9 100644
--- a/cl/kernel.go
+++ b/cl/kernel.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/memory.go b/cl/memory.go
index 55c0f8d..1c47c4b 100644
--- a/cl/memory.go
+++ b/cl/memory.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/platform.go b/cl/platform.go
index a316e73..cdc90c0 100644
--- a/cl/platform.go
+++ b/cl/platform.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/program.go b/cl/program.go
index 7c10d74..b36a287 100644
--- a/cl/program.go
+++ b/cl/program.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/program11.go b/cl/program11.go
index dd5c2fc..40a1c15 100644
--- a/cl/program11.go
+++ b/cl/program11.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/queue.go b/cl/queue.go
index 74b872c..e051a52 100644
--- a/cl/queue.go
+++ b/cl/queue.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*
diff --git a/cl/sampler.go b/cl/sampler.go
index 8b62c32..16f5780 100644
--- a/cl/sampler.go
+++ b/cl/sampler.go
@@ -1,4 +1,3 @@
-
 package cl
 
 /*

From 364bfc456ff60dcfe4752a6e4c6237bda090476a Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Tue, 13 Sep 2016 15:59:55 -0500
Subject: [PATCH 053/150] move deviceListIndex increment back to the right spot
 (#93)

---
 miner.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/miner.go b/miner.go
index 311d9a0..919089e 100644
--- a/miner.go
+++ b/miner.go
@@ -122,8 +122,8 @@ func NewMiner() (*Miner, error) {
 					if err != nil {
 						return nil, err
 					}
-					deviceListIndex++
 				}
+				deviceListIndex++
 			}
 
 			if deviceListEnabledCount == 0 {

From d85fd6f66efb1f1861bce93cca22d762e5f41f6f Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 14 Sep 2016 08:30:37 -0400
Subject: [PATCH 054/150] Clean up some old or incorrect comments.

Closes #94
---
 cudevice.go | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/cudevice.go b/cudevice.go
index 6b8826d..00ef58f 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -46,7 +46,6 @@ func getCUInfo() ([]cu.Device, error) {
 	ids := cu.DeviceGetCount()
 	minrLog.Infof("%v GPUs", ids)
 	var CUdevices []cu.Device
-	// XXX Do this more like ListCuDevices
 	for i := 0; i < ids; i++ {
 		dev := cu.DeviceGet(i)
 		CUdevices = append(CUdevices, dev)
@@ -140,7 +139,7 @@ func (d *Device) runCuDevice() error {
 	// Allocate the input region
 	d.cuContext.SetCurrent()
 
-	// kernel is built with nvcc, not an api call so much bet done
+	// kernel is built with nvcc, not an api call so must be done
 	// at compile time.
 
 	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
@@ -197,19 +196,14 @@ func (d *Device) runCuDevice() error {
 		// Execute the kernel and follow its execution time.
 		currentTime := time.Now()
 
-		// TODO Which nonceword is this?  In ccminer it is &pdata[35]
 		startNonce := d.lastBlock[work.Nonce1Word]
-		//fmt.Printf("%p %v\n", &startNonce, startNonce)
 
-		throughput := uint32(0x20000000) // TODO
-		//throughput = minUint32(throughput, ^uint32(0)-nonce)
-		//gridx := int((throughput + threadsPerBlock - 1) / threadsPerBlock)
-		//gridx := (int(throughput) + 639) / 640
+		throughput := uint32(0x20000000)
 		gridx := ((throughput - 1) / 640)
 
-		gridx = 52428 // don't ask me why this works.
+		gridx = 52428 // like ccminer
 
-		targetHigh := ^uint32(0) // TODO
+		targetHigh := ^uint32(0)
 
 		decredHashNonce(gridx, blockx, throughput, startNonce, nonceResultsD, targetHigh)
 

From b940a45ccfbb87ea9aac6422aae687c5ca144009 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Wed, 14 Sep 2016 11:55:49 -0400
Subject: [PATCH 055/150] fix cgo Go pointers issue (#92)

from
https://github.com/rainliu/gocl/commit/5e6a128b97580c6ffdb9eafd8c35ed6d0e074dbb

Also makes go vet happy.
---
 cl/context.go | 30 ++++++++++++++++++++----------
 cudevice.go   |  2 --
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/cl/context.go b/cl/context.go
index affcbbc..09f0e37 100644
--- a/cl/context.go
+++ b/cl/context.go
@@ -19,6 +19,10 @@ static void CL_CALLBACK c_ctx_notify(const char *errinfo, const void *private_in
 	go_ctx_notify((char *)errinfo, (void *)private_info, cb, user_data);
 }
 
+typedef void* pVoid;
+static pVoid* allocArray(size_t n) { return (pVoid*)malloc(n * sizeof(pVoid)); }
+static void   freeArray (pVoid* p) { free(p); }
+
 static cl_context CLCreateContext(	const cl_context_properties *  	properties,
 					                cl_uint                  		num_devices,
 					                const cl_device_id *     		devices,
@@ -92,12 +96,14 @@ func CLCreateContext(properties []CL_context_properties,
 		}
 
 		if pfn_notify != nil {
-			var c_user_data []unsafe.Pointer
-			c_user_data = make([]unsafe.Pointer, 2)
-			c_user_data[0] = user_data
-			c_user_data[1] = unsafe.Pointer(&pfn_notify)
+			//var c_user_data []unsafe.Pointer
+			//c_user_data = make([]unsafe.Pointer, 2)
+			arr := C.allocArray(2)
+			c_user_data := (*[2]C.pVoid)(unsafe.Pointer(arr))[:]
+			c_user_data[0] = (C.pVoid)(user_data)
+			c_user_data[1] = (C.pVoid)(unsafe.Pointer(&pfn_notify))
 
-			ctx_notify[c_user_data[1]] = pfn_notify
+			ctx_notify[unsafe.Pointer(&pfn_notify)] = pfn_notify
 
 			c_context = C.CLCreateContext(c_properties_ptr,
 				C.cl_uint(len(c_devices)),
@@ -105,6 +111,7 @@ func CLCreateContext(properties []CL_context_properties,
 				unsafe.Pointer(&c_user_data),
 				&c_errcode_ret)
 
+			C.freeArray(arr)
 		} else {
 			c_context = C.clCreateContext(c_properties_ptr,
 				C.cl_uint(len(c_devices)),
@@ -150,18 +157,21 @@ func CLCreateContextFromType(properties []CL_context_properties,
 		}
 
 		if pfn_notify != nil {
-			var c_user_data []unsafe.Pointer
-			c_user_data = make([]unsafe.Pointer, 2)
-			c_user_data[0] = user_data
-			c_user_data[1] = unsafe.Pointer(&pfn_notify)
+			//var c_user_data []unsafe.Pointer
+			//c_user_data = make([]unsafe.Pointer, 2)
+			arr := C.allocArray(2)
+			c_user_data := (*[2]C.pVoid)(unsafe.Pointer(arr))[:]
+			c_user_data[0] = (C.pVoid)(user_data)
+			c_user_data[1] = (C.pVoid)(unsafe.Pointer(&pfn_notify))
 
-			ctx_notify[c_user_data[1]] = pfn_notify
+			ctx_notify[unsafe.Pointer(&pfn_notify)] = pfn_notify
 
 			c_context = C.CLCreateContextFromType(c_properties_ptr,
 				C.cl_device_type(device_type),
 				unsafe.Pointer(&c_user_data),
 				&c_errcode_ret)
 
+			C.freeArray(arr)
 		} else {
 			c_context = C.clCreateContextFromType(c_properties_ptr,
 				C.cl_device_type(device_type),
diff --git a/cudevice.go b/cudevice.go
index 00ef58f..203d050 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -231,8 +231,6 @@ func (d *Device) runCuDevice() error {
 		minrLog.Tracef("GPU #%d: Kernel execution to read time: %v", d.index,
 			elapsedTime)
 	}
-
-	return nil
 }
 
 func minUint32(a, b uint32) uint32 {

From 5032c42a285955022400dfad121e754dc1aa82f5 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Wed, 14 Sep 2016 12:25:25 -0400
Subject: [PATCH 056/150] Hook up travis (#75)

---
 .travis.yml | 19 +++++++++++++++++++
 goclean.sh  | 17 +++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 .travis.yml
 create mode 100755 goclean.sh

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..f050815
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,19 @@
+language: go
+go:
+  - 1.6.3
+  - 1.7.1
+sudo: required
+dist: trusty
+before_install:
+  - sudo apt-get update
+  - sudo apt-get install opencl-headers nvidia-opencl-dev
+install:
+  - go get -v github.com/Masterminds/glide
+  - glide install
+  - go get -v golang.org/x/tools/cmd/cover
+  - go get -v github.com/bradfitz/goimports
+  - go get -v github.com/golang/lint/golint
+  - go get -v github.com/davecgh/go-spew/spew
+script:
+  - export PATH=$PATH:$HOME/gopath/bin
+  - ./goclean.sh
diff --git a/goclean.sh b/goclean.sh
new file mode 100755
index 0000000..2d0a89b
--- /dev/null
+++ b/goclean.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# The script does automatic checking on a Go package and its sub-packages, including:
+# 1. gofmt         (http://golang.org/cmd/gofmt/)
+# 2. golint        (https://github.com/golang/lint)
+# 3. go vet        (http://golang.org/cmd/vet)
+# 4. race detector (http://blog.golang.org/race-detector)
+# 5. test coverage (http://blog.golang.org/cover)
+
+set -ex
+
+# Automatic checks
+test -z "$(go fmt $(glide novendor) | tee /dev/stderr)"
+# TODO
+#test -z "$(for package in $(glide novendor); do golint $package; done | grep -v 'ALL_CAPS\|OP_\|NewFieldVal' | tee /dev/stderr)"
+test -z "$(go vet $(glide novendor) 2>&1 | tee /dev/stderr)"
+# TODO
+#env GORACE="halt_on_error=1" go test -v -race $(glide novendor)

From 7a61062d29d2e9157732583ba4392c2bd18ebb8a Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Wed, 14 Sep 2016 12:43:41 -0500
Subject: [PATCH 057/150] use nvml to fetch fan and temperature information
 (#96)

Adds https://github.com/abduld/nvml-go/ with some minor modifications.
---
 cudevice.go  |   64 +-
 device.go    |   63 +-
 miner.go     |    1 +
 nvml/LICENSE |   27 +
 nvml/nvml.go |  193 +++
 nvml/nvml.h  | 4382 ++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 4707 insertions(+), 23 deletions(-)
 create mode 100644 nvml/LICENSE
 create mode 100755 nvml/nvml.go
 create mode 100644 nvml/nvml.h

diff --git a/cudevice.go b/cudevice.go
index 203d050..78963dd 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -14,11 +14,13 @@ import (
 	"fmt"
 	"reflect"
 	"runtime"
+	"sync/atomic"
 	"time"
 	"unsafe"
 
 	"github.com/mumax/3/cuda/cu"
 
+	"github.com/decred/gominer/nvml"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
@@ -41,6 +43,39 @@ func decredHashNonce(gridx, blockx, threads uint32, startNonce uint32, nonceResu
 		C.uint32_t(startNonce), (*C.uint32_t)(unsafe.Pointer(nonceResults)), C.uint32_t(targetHigh))
 }
 
+func deviceInfoNVIDIA(index int) (uint32, uint32) {
+	fanPercent := uint32(0)
+	temperature := uint32(0)
+
+	err := nvml.Init()
+	if err != nil {
+		minrLog.Errorf("NVML Init error: %v", err)
+		return fanPercent, temperature
+	}
+
+	dh, err := nvml.DeviceGetHandleByIndex(index)
+	if err != nil {
+		minrLog.Errorf("NVML DeviceGetHandleByIndex error: %v", err)
+		return fanPercent, temperature
+	}
+
+	nvmlFanSpeed, err := nvml.DeviceFanSpeed(dh)
+	if err != nil {
+		minrLog.Infof("NVML DeviceFanSpeed error: %v", err)
+	} else {
+		fanPercent = uint32(nvmlFanSpeed)
+	}
+
+	nvmlTemp, err := nvml.DeviceTemperature(dh)
+	if err != nil {
+		minrLog.Infof("NVML DeviceTemperature error: %v", err)
+	} else {
+		temperature = uint32(nvmlTemp)
+	}
+
+	return fanPercent, temperature
+}
+
 func getCUInfo() ([]cu.Device, error) {
 	cu.Init(0)
 	ids := cu.DeviceGetCount()
@@ -68,7 +103,7 @@ func getCUDevices() ([]cu.Device, error) {
 	minMinor := 5
 
 	if maj < minMajor || (maj == minMajor && min < minMinor) {
-		return nil, fmt.Errorf("Driver does not suppoer CUDA %v.%v API", minMajor, minMinor)
+		return nil, fmt.Errorf("Driver does not support CUDA %v.%v API", minMajor, minMinor)
 	}
 
 	var numDevices int
@@ -103,23 +138,34 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 	workDone chan []byte) (*Device, error) {
 
 	d := &Device{
-		index:      index,
-		cuDeviceID: deviceID,
-		deviceName: deviceID.Name(),
-		cuda:       true,
-		quit:       make(chan struct{}),
-		newWork:    make(chan *work.Work, 5),
-		workDone:   workDone,
+		index:       index,
+		cuDeviceID:  deviceID,
+		deviceName:  deviceID.Name(),
+		cuda:        true,
+		kind:        "nvidia",
+		quit:        make(chan struct{}),
+		newWork:     make(chan *work.Work, 5),
+		workDone:    workDone,
+		fanPercent:  0,
+		temperature: 0,
 	}
 
 	d.cuInSize = 21
 
+	fanPercent, temperature := deviceInfoNVIDIA(d.index)
+	// Newer cards will idle with the fan off so just check if we got
+	// a good temperature reading
+	if temperature != 0 {
+		atomic.StoreUint32(&d.fanPercent, fanPercent)
+		atomic.StoreUint32(&d.temperature, temperature)
+		d.fanTempActive = true
+	}
+
 	d.started = uint32(time.Now().Unix())
 
 	// Autocalibrate?
 
 	return d, nil
-
 }
 
 func (d *Device) runCuDevice() error {
diff --git a/device.go b/device.go
index d7c6146..dcd0acd 100644
--- a/device.go
+++ b/device.go
@@ -11,6 +11,7 @@ import (
 	"math/big"
 	"os"
 	"sync"
+	"sync/atomic"
 	"time"
 	"unsafe"
 
@@ -66,19 +67,25 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 }
 
 type Device struct {
+	// The following variables must only be used atomically.
+	fanPercent  uint32
+	temperature uint32
+
 	sync.Mutex
 	index int
 	cuda  bool
 
 	// Items for OpenCL device
-	platformID   cl.CL_platform_id
-	deviceID     cl.CL_device_id
-	deviceName   string
-	context      cl.CL_context
-	queue        cl.CL_command_queue
-	outputBuffer cl.CL_mem
-	program      cl.CL_program
-	kernel       cl.CL_kernel
+	platformID    cl.CL_platform_id
+	deviceID      cl.CL_device_id
+	deviceName    string
+	context       cl.CL_context
+	queue         cl.CL_command_queue
+	outputBuffer  cl.CL_mem
+	program       cl.CL_program
+	kernel        cl.CL_kernel
+	fanTempActive bool
+	kind          string
 
 	// Items for CUDA device
 	cuDeviceID cu.Device
@@ -311,10 +318,38 @@ func (d *Device) PrintStats() {
 		float64(d.allDiffOneShares)) /
 		float64(secondsElapsed)
 
-	minrLog.Infof("DEV #%d (%s) reporting average hash rate %v, %v/%v valid work",
-		d.index,
-		d.deviceName,
-		util.FormatHashRate(averageHashRate),
-		d.validShares,
-		d.validShares+d.invalidShares)
+	fanPercent := atomic.LoadUint32(&d.fanPercent)
+	temperature := atomic.LoadUint32(&d.temperature)
+
+	if fanPercent != 0 || temperature != 0 {
+		minrLog.Infof("DEV #%d (%s) reporting average hash rate %v, %v/%v valid work, Fan=%v%% Temp=%vC",
+			d.index,
+			d.deviceName,
+			util.FormatHashRate(averageHashRate),
+			d.validShares,
+			d.validShares+d.invalidShares,
+			fanPercent,
+			temperature)
+	} else {
+		minrLog.Infof("DEV #%d (%s) reporting average hash rate %v, %v/%v valid work",
+			d.index,
+			d.deviceName,
+			util.FormatHashRate(averageHashRate),
+			d.validShares,
+			d.validShares+d.invalidShares)
+	}
+}
+
+// UpdateFanTemp refreshes the cached fan and temperature readings, if active.
+func (d *Device) UpdateFanTemp() {
+	d.Lock()
+	defer d.Unlock()
+	if d.fanTempActive { // set by NewCuDevice after a non-zero first temperature reading
+		switch d.kind {
+		case "nvidia":
+			fanPercent, temperature := deviceInfoNVIDIA(d.index)
+			atomic.StoreUint32(&d.fanPercent, fanPercent)
+			atomic.StoreUint32(&d.temperature, temperature)
+		}
+	}
 }
diff --git a/miner.go b/miner.go
index 919089e..2732d12 100644
--- a/miner.go
+++ b/miner.go
@@ -249,6 +249,7 @@ func (m *Miner) printStatsThread() {
 			}
 		}
 		for _, d := range m.devices {
+			d.UpdateFanTemp()
 			d.PrintStats()
 		}
 
diff --git a/nvml/LICENSE b/nvml/LICENSE
new file mode 100644
index 0000000..7277892
--- /dev/null
+++ b/nvml/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2014, abduld
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/nvml/nvml.go b/nvml/nvml.go
new file mode 100755
index 0000000..1a7eab1
--- /dev/null
+++ b/nvml/nvml.go
@@ -0,0 +1,193 @@
+package nvml
+
+// #cgo LDFLAGS: -lnvidia-ml
+// #include <stdio.h>
+// #include <stdlib.h>
+// #include <nvml.h>
+import "C"
+
+import (
+	"unsafe"
+)
+
+type ComputeMode C.nvmlComputeMode_t
+type Feature uint
+type ECCBitType uint
+type ECCCounterType uint
+type ClockType uint
+type DriverModel uint
+type PState C.nvmlPstates_t
+type InformObject uint
+type Result struct {
+	code C.nvmlReturn_t
+}
+
+func (r Result) String() string {
+	switch r.code {
+	case 0:
+		return "Success"
+	case 1:
+		return "Uninitialized"
+	case 2:
+		return "InvalidArgument"
+	case 3:
+		return "NotSupported"
+	case 4:
+		return "NoPermission"
+	case 5:
+		return "AlreadyInitialized"
+	case 6:
+		return "NotFound"
+	case 7:
+		return "InsufficientSize"
+	case 8:
+		return "InsufficientPower"
+	case 9:
+		return "DriverNotLoaded"
+	case 10:
+		return "Timeout"
+	case 99:
+		return "Unknown"
+	}
+	return "UnknownError"
+}
+
+func (r Result) Error() string {
+	return r.String()
+}
+
+// SuccessQ reports whether the wrapped NVML return code is 0.
+//
+// Code 0 is the value String renders as "Success" and the only one NewResult
+// maps to a nil error; every other code yields false.
+func (r Result) SuccessQ() bool {
+	return r.code == 0
+}
+
+// NewResult maps an NVML return code to nil (code 0) or a *Result error.
+func NewResult(r C.nvmlReturn_t) error {
+	if r == 0 {
+		return nil
+	}
+	return &Result{code: r}
+}
+
+func Init() error {
+	r := C.nvmlInit()
+	return NewResult(r)
+}
+
+func Shutdown() error {
+	r := C.nvmlShutdown()
+	return NewResult(r)
+}
+
+func ErrorString(r Result) string {
+	s := C.nvmlErrorString(r.code)
+	return C.GoString(s)
+}
+
+func DeviceCount() (int, error) {
+	var count C.uint = 0
+	r := NewResult(C.nvmlDeviceGetCount(&count))
+	return int(count), r
+}
+
+type DeviceHandle struct {
+	handle C.nvmlDevice_t
+}
+
+func DeviceGetHandleByIndex(idx int) (DeviceHandle, error) {
+	var device C.nvmlDevice_t
+	r := NewResult(C.nvmlDeviceGetHandleByIndex(C.uint(idx), &device))
+	return DeviceHandle{device}, r
+}
+
+//compute mode
+
+func DeviceComputeMode(dh DeviceHandle) (ComputeMode, error) {
+	var mode C.nvmlComputeMode_t
+	r := NewResult(C.nvmlDeviceGetComputeMode(dh.handle, &mode))
+	return ComputeMode(mode), r
+}
+
+// STRING_BUFFER_SIZE is the byte size of the C buffers handed to NVML for strings.
+const STRING_BUFFER_SIZE = 100
+
+// makeStringBuffer returns a zeroed, malloc'd C buffer; the caller must free it.
+func makeStringBuffer(sz int) *C.char {
+	return C.CString(string(make([]byte, sz)))
+}
+
+// DeviceName returns the device's product name, cut at the C NUL terminator.
+func DeviceName(dh DeviceHandle) (string, error) {
+	name := makeStringBuffer(STRING_BUFFER_SIZE)
+	defer C.free(unsafe.Pointer(name))
+	r := NewResult(C.nvmlDeviceGetName(dh.handle, name, C.uint(STRING_BUFFER_SIZE)))
+	return C.GoString(name), r
+}
+
+type MemoryInformation struct {
+	Used  uint64 `json:"used"`
+	Free  uint64 `json:"free"`
+	Total uint64 `json:"total"`
+}
+
+// DeviceMemoryInformation returns the device's framebuffer memory usage in
+// bytes as reported by nvmlDeviceGetMemoryInfo.
+func DeviceMemoryInformation(dh DeviceHandle) (MemoryInformation, error) {
+	var temp C.nvmlMemory_t
+	if r := NewResult(C.nvmlDeviceGetMemoryInfo(dh.handle, &temp)); r != nil {
+		return MemoryInformation{}, r
+	}
+	return MemoryInformation{
+		Used:  uint64(temp.used),
+		Free:  uint64(temp.free),
+		Total: uint64(temp.total),
+	}, nil
+}
+
+type PCIInformation struct {
+	BusId       string `json:"bus_id"`
+	Domain      uint   `json:"domain"`
+	Bus         uint   `json:"bus"`
+	Device      uint   `json:"device"`
+	DeviceId    uint   `json:"device_id"`
+	SubSystemId uint   `json:"subsystem_id"`
+}
+
+// DevicePCIInformation returns the PCI bus ID, location and device IDs of the device.
+func DevicePCIInformation(dh DeviceHandle) (PCIInformation, error) {
+	var temp C.nvmlPciInfo_t
+	if r := NewResult(C.nvmlDeviceGetPciInfo(dh.handle, &temp)); r != nil {
+		return PCIInformation{}, r
+	}
+	return PCIInformation{
+		// busId is a NUL-terminated C string in a fixed-size array; stop at
+		// the terminator instead of copying the array's trailing NUL padding.
+		BusId:       C.GoString(&temp.busId[0]),
+		Domain:      uint(temp.domain),
+		Bus:         uint(temp.bus),
+		Device:      uint(temp.device),
+		DeviceId:    uint(temp.pciDeviceId),
+		SubSystemId: uint(temp.pciSubSystemId),
+	}, nil
+}
+
+func DeviceTemperature(dh DeviceHandle) (uint, error) {
+	var temp C.uint
+	r := NewResult(C.nvmlDeviceGetTemperature(dh.handle, C.nvmlTemperatureSensors_t(0), &temp))
+	return uint(temp), r
+}
+
+func DevicePerformanceState(dh DeviceHandle) (PState, error) {
+	var pstate C.nvmlPstates_t
+	r := NewResult(C.nvmlDeviceGetPerformanceState(dh.handle, &pstate))
+	return PState(pstate), r
+}
+
+func DeviceFanSpeed(dh DeviceHandle) (uint, error) {
+	var speed C.uint
+	r := NewResult(C.nvmlDeviceGetFanSpeed(dh.handle, &speed))
+	return uint(speed), r
+}
diff --git a/nvml/nvml.h b/nvml/nvml.h
new file mode 100644
index 0000000..c888841
--- /dev/null
+++ b/nvml/nvml.h
@@ -0,0 +1,4382 @@
+/*
+ * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:   
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and 
+ * international Copyright laws.  Users and possessors of this source code 
+ * are hereby granted a nonexclusive, royalty-free license to use this code 
+ * in individual and commercial software.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE 
+ * OR PERFORMANCE OF THIS SOURCE CODE.  
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as 
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
+ * "commercial computer  software"  and "commercial computer software 
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) 
+ * and is provided to the U.S. Government only as a commercial end item.  
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
+ * source code with only those rights set forth herein. 
+ *
+ * Any use of this source code in individual and commercial software must 
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+/* 
+NVML API Reference
+
+The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and 
+managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building
+3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi
+tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads.
+
+API Documentation
+
+Supported platforms:
+- Windows:     Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit
+- Linux:       32-bit and 64-bit
+- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5
+
+Supported products:
+- Full Support
+    - All Tesla products, starting with the Fermi architecture
+    - All Quadro products, starting with the Fermi architecture
+    - All GRID products, starting with the Kepler architecture
+    - Selected GeForce Titan products
+- Limited Support
+    - All Geforce products, starting with the Fermi architecture
+
+The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is
+not added to the system path by default. To dynamically link to NVML, add this path to the PATH 
+environmental variable. To dynamically load NVML, call LoadLibrary with this path.
+
+On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit
+and 64 bit NVML libraries will be installed.
+
+Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html
+*/
+
+#ifndef __nvml_nvml_h__
+#define __nvml_nvml_h__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On Windows, set up methods for DLL export
+ * define NVML_STATIC_IMPORT when using nvml_loader library
+ */
+#if defined _WINDOWS
+    #if !defined NVML_STATIC_IMPORT
+        #if defined NVML_LIB_EXPORT
+            #define DECLDIR __declspec(dllexport)
+        #else
+            #define DECLDIR __declspec(dllimport)
+        #endif
+    #else
+        #define DECLDIR
+    #endif
+#else
+    #define DECLDIR
+#endif
+
+/**
+ * NVML API versioning support
+ */
+#define NVML_API_VERSION            8
+#define NVML_API_VERSION_STR        "8"
+#define nvmlInit                    nvmlInit_v2
+#define nvmlDeviceGetPciInfo        nvmlDeviceGetPciInfo_v2
+#define nvmlDeviceGetCount          nvmlDeviceGetCount_v2
+#define nvmlDeviceGetHandleByIndex  nvmlDeviceGetHandleByIndex_v2
+#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceStructs Device Structs
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Special constant that some fields take when they are not available.
+ * Used when only part of the struct is not available.
+ *
+ * Each structure explicitly states when to check for this value.
+ */
+#define NVML_VALUE_NOT_AVAILABLE (-1)
+
+typedef struct nvmlDevice_st* nvmlDevice_t;
+
+/**
+ * Buffer size guaranteed to be large enough for pci bus id
+ */
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE   16
+
+/**
+ * PCI information about a GPU device.
+ */
+typedef struct nvmlPciInfo_st 
+{
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
+    unsigned int domain;             //!< The PCI domain on which the device's bus resides, 0 to 0xffff
+    unsigned int bus;                //!< The bus on which the device resides, 0 to 0xff
+    unsigned int device;             //!< The device's id on the bus, 0 to 31
+    unsigned int pciDeviceId;        //!< The combined 16-bit device id and 16-bit vendor id
+    
+    // Added in NVML 2.285 API
+    unsigned int pciSubSystemId;     //!< The 32-bit Sub System Device ID
+    
+    // NVIDIA reserved for internal use only
+    unsigned int reserved0;
+    unsigned int reserved1;
+    unsigned int reserved2;
+    unsigned int reserved3;
+} nvmlPciInfo_t;
+
+/**
+ * Detailed ECC error counts for a device.
+ *
+ * @deprecated  Different GPU families can have different memory error counters
+ *              See \ref nvmlDeviceGetMemoryErrorCounter
+ */
+typedef struct nvmlEccErrorCounts_st 
+{
+    unsigned long long l1Cache;      //!< L1 cache errors
+    unsigned long long l2Cache;      //!< L2 cache errors
+    unsigned long long deviceMemory; //!< Device memory errors
+    unsigned long long registerFile; //!< Register file errors
+} nvmlEccErrorCounts_t;
+
+/** 
+ * Utilization information for a device.
+ * Each sample period may be between 1 second and 1/6 second, depending on the product being queried.
+ */
+typedef struct nvmlUtilization_st 
+{
+    unsigned int gpu;                //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
+    unsigned int memory;             //!< Percent of time over the past sample period during which global (device) memory was being read or written
+} nvmlUtilization_t;
+
+/** 
+ * Memory allocation information for a device.
+ */
+typedef struct nvmlMemory_st 
+{
+    unsigned long long total;        //!< Total installed FB memory (in bytes)
+    unsigned long long free;         //!< Unallocated FB memory (in bytes)
+    unsigned long long used;         //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
+} nvmlMemory_t;
+
+/**
+ * BAR1 Memory allocation Information for a device
+ */
+typedef struct nvmlBAR1Memory_st
+{
+    unsigned long long bar1Total;    //!< Total BAR1 Memory (in bytes)
+    unsigned long long bar1Free;     //!< Unallocated BAR1 Memory (in bytes)
+    unsigned long long bar1Used;     //!< Allocated Used Memory (in bytes)
+}nvmlBAR1Memory_t;
+
+/**
+ * Information about running compute processes on the GPU
+ */
+typedef struct nvmlProcessInfo_st
+{
+    unsigned int pid;                 //!< Process ID
+    unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes.
+                                      //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
+                                      //! because Windows KMD manages all the memory and not the NVIDIA driver
+} nvmlProcessInfo_t;
+
+
+/**
+ * Enum to represent type of bridge chip
+ */
+typedef enum nvmlBridgeChipType_enum
+{
+    NVML_BRIDGE_CHIP_PLX = 0,
+    NVML_BRIDGE_CHIP_BRO4 = 1           
+}nvmlBridgeChipType_t;
+
+/**
+ * Maximum number of NvLink links supported 
+ */
+#define NVML_NVLINK_MAX_LINKS 4
+
+/**
+ * Enum to represent the NvLink utilization counter packet units
+ */
+typedef enum nvmlNvLinkUtilizationCountUnits_enum
+{
+    NVML_NVLINK_COUNTER_UNIT_CYCLES =  0,     // count by cycles
+    NVML_NVLINK_COUNTER_UNIT_PACKETS = 1,     // count by packets
+    NVML_NVLINK_COUNTER_UNIT_BYTES   = 2,     // count by bytes
+
+    // this must be last
+    NVML_NVLINK_COUNTER_UNIT_COUNT
+} nvmlNvLinkUtilizationCountUnits_t;
+
+/**
+ * Enum to represent the NvLink utilization counter packet types to count
+ *  ** this is ONLY applicable with the units as packets or bytes
+ *  ** as specified in \a nvmlNvLinkUtilizationCountUnits_t
+ *  ** all packet filter descriptions are target GPU centric
+ *  ** these can be "OR'd" together 
+ */
+typedef enum nvmlNvLinkUtilizationCountPktTypes_enum
+{
+    NVML_NVLINK_COUNTER_PKTFILTER_NOP        = 0x1,     // no operation packets
+    NVML_NVLINK_COUNTER_PKTFILTER_READ       = 0x2,     // read packets
+    NVML_NVLINK_COUNTER_PKTFILTER_WRITE      = 0x4,     // write packets
+    NVML_NVLINK_COUNTER_PKTFILTER_RATOM      = 0x8,     // reduction atomic requests
+    NVML_NVLINK_COUNTER_PKTFILTER_NRATOM     = 0x10,    // non-reduction atomic requests
+    NVML_NVLINK_COUNTER_PKTFILTER_FLUSH      = 0x20,    // flush requests
+    NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA   = 0x40,    // responses with data
+    NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80,    // responses without data
+    NVML_NVLINK_COUNTER_PKTFILTER_ALL        = 0xFF     // all packets
+} nvmlNvLinkUtilizationCountPktTypes_t;
+
+/** 
+ * Struct to define the NVLINK counter controls
+ */
+typedef struct nvmlNvLinkUtilizationControl_st
+{
+    nvmlNvLinkUtilizationCountUnits_t units;
+    nvmlNvLinkUtilizationCountPktTypes_t pktfilter;
+} nvmlNvLinkUtilizationControl_t;
+
+/**
+ * Enum to represent NvLink queryable capabilities
+ */
+typedef enum nvmlNvLinkCapability_enum
+{
+    NVML_NVLINK_CAP_P2P_SUPPORTED = 0,     // P2P over NVLink is supported
+    NVML_NVLINK_CAP_SYSMEM_ACCESS = 1,     // Access to system memory is supported
+    NVML_NVLINK_CAP_P2P_ATOMICS   = 2,     // P2P atomics are supported
+    NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3,     // System memory atomics are supported
+    NVML_NVLINK_CAP_SLI_BRIDGE    = 4,     // SLI is supported over this link
+    NVML_NVLINK_CAP_VALID         = 5,     // Link is supported on this device
+    // should be last
+    NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+/**
+ * Enum to represent NvLink queryable error counters
+ */
+typedef enum nvmlNvLinkErrorCounter_enum
+{
+    NVML_NVLINK_ERROR_DL_REPLAY   = 0,     // Data link transmit replay error counter
+    NVML_NVLINK_ERROR_DL_RECOVERY = 1,     // Data link transmit recovery error counter
+    NVML_NVLINK_ERROR_DL_CRC_FLIT = 2,     // Data link receive flow control digit CRC error counter
+    NVML_NVLINK_ERROR_DL_CRC_DATA = 3,     // Data link receive data CRC error counter
+
+    // this must be last
+    NVML_NVLINK_ERROR_COUNT
+} nvmlNvLinkErrorCounter_t;
+
+/**
+ * Represents level relationships within a system between two GPUs
+ * The enums are spaced to allow for future relationships
+ */
+typedef enum nvmlGpuLevel_enum
+{
+    NVML_TOPOLOGY_INTERNAL           = 0, // e.g. Tesla K80
+    NVML_TOPOLOGY_SINGLE             = 10, // all devices that only need traverse a single PCIe switch
+    NVML_TOPOLOGY_MULTIPLE           = 20, // all devices that need not traverse a host bridge
+    NVML_TOPOLOGY_HOSTBRIDGE         = 30, // all devices that are connected to the same host bridge
+    NVML_TOPOLOGY_CPU                = 40, // all devices that are connected to the same CPU but possibly multiple host bridges
+    NVML_TOPOLOGY_SYSTEM             = 50, // all devices in the system
+
+    // there is purposefully no COUNT here because of the need for spacing above
+} nvmlGpuTopologyLevel_t;
+
+
+/**
+ * Maximum limit on Physical Bridges per Board
+ */
+#define NVML_MAX_PHYSICAL_BRIDGE                         (128)
+
+/**
+ * Information about the Bridge Chip Firmware
+ */
+typedef struct nvmlBridgeChipInfo_st
+{
+    nvmlBridgeChipType_t type;                  //!< Type of Bridge Chip 
+    unsigned int fwVersion;                     //!< Firmware Version. 0=Version is unavailable
+}nvmlBridgeChipInfo_t;
+
+/**
+ * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate 
+ * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth.
+ */
+typedef struct nvmlBridgeChipHierarchy_st
+{
+    unsigned char  bridgeCount;                 //!< Number of Bridge Chips on the Board
+    nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board
+}nvmlBridgeChipHierarchy_t;
+
+/**
+ *  Represents Type of Sampling Event
+ */
+typedef enum nvmlSamplingType_enum
+{
+    NVML_TOTAL_POWER_SAMPLES        = 0, //!< To represent total power drawn by GPU
+    NVML_GPU_UTILIZATION_SAMPLES    = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU
+    NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written
+    NVML_ENC_UTILIZATION_SAMPLES    = 3, //!< To represent percent of time during which NVENC remains busy
+    NVML_DEC_UTILIZATION_SAMPLES    = 4, //!< To represent percent of time during which NVDEC remains busy            
+    NVML_PROCESSOR_CLK_SAMPLES      = 5, //!< To represent processor clock samples
+    NVML_MEMORY_CLK_SAMPLES         = 6, //!< To represent memory clock samples
+            
+    // Keep this last
+    NVML_SAMPLINGTYPE_COUNT               
+}nvmlSamplingType_t;
+
+/**
+ * Represents the queryable PCIe utilization counters
+ */
+typedef enum nvmlPcieUtilCounter_enum
+{
+    NVML_PCIE_UTIL_TX_BYTES             = 0, // 1KB granularity
+    NVML_PCIE_UTIL_RX_BYTES             = 1, // 1KB granularity
+    
+    // Keep this last
+    NVML_PCIE_UTIL_COUNT
+} nvmlPcieUtilCounter_t;
+
+/**
+ * Represents the type for sample value returned
+ */
+typedef enum nvmlValueType_enum 
+{
+    NVML_VALUE_TYPE_DOUBLE = 0,
+    NVML_VALUE_TYPE_UNSIGNED_INT = 1,
+    NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
+    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
+
+    // Keep this last
+    NVML_VALUE_TYPE_COUNT
+}nvmlValueType_t;
+
+
+/**
+ * Union to represent different types of Value
+ */
+typedef union nvmlValue_st
+{
+    double dVal;                    //!< If the value is double
+    unsigned int uiVal;             //!< If the value is unsigned int
+    unsigned long ulVal;            //!< If the value is unsigned long
+    unsigned long long ullVal;      //!< If the value is unsigned long long
+}nvmlValue_t;
+
+/**
+ * Information for Sample
+ */
+typedef struct nvmlSample_st 
+{
+    unsigned long long timeStamp;       //!< CPU Timestamp in microseconds
+    nvmlValue_t sampleValue;        //!< Sample Value
+}nvmlSample_t;
+
+/**
+ * Represents type of perf policy for which violation times can be queried 
+ */
+typedef enum nvmlPerfPolicyType_enum
+{
+    NVML_PERF_POLICY_POWER = 0,
+    NVML_PERF_POLICY_THERMAL = 1,
+    NVML_PERF_POLICY_SYNC_BOOST = 2,
+
+    // Keep this last
+    NVML_PERF_POLICY_COUNT
+}nvmlPerfPolicyType_t;
+
+/**
+ * Struct to hold perf policy violation status data
+ */
+typedef struct nvmlViolationTime_st
+{
+    unsigned long long referenceTime;  //!< referenceTime represents CPU timestamp in microseconds
+    unsigned long long violationTime;  //!< violationTime in Nanoseconds
+}nvmlViolationTime_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceEnumvs Device Enums
+ *  @{
+ */
+/***************************************************************************************************/
+
+/** 
+ * Generic enable/disable enum. 
+ */
+typedef enum nvmlEnableState_enum 
+{
+    NVML_FEATURE_DISABLED    = 0,     //!< Feature disabled 
+    NVML_FEATURE_ENABLED     = 1      //!< Feature enabled
+} nvmlEnableState_t;
+
+//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details.
+#define nvmlFlagDefault     0x00      
+//! Generic flag used to force some behavior. See description of particular functions for details.
+#define nvmlFlagForce       0x01      
+
+/**
+ * The Brand of the GPU
+ */
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN = 0, 
+    NVML_BRAND_QUADRO  = 1,
+    NVML_BRAND_TESLA   = 2,
+    NVML_BRAND_NVS     = 3,
+    NVML_BRAND_GRID    = 4,
+    NVML_BRAND_GEFORCE = 5,
+
+    // Keep this last
+    NVML_BRAND_COUNT
+} nvmlBrandType_t;
+
+/**
+ * Temperature thresholds.
+ */
+typedef enum nvmlTemperatureThresholds_enum
+{
+    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0,    // Temperature at which the GPU will shut down
+                                                // for HW protection
+    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1,    // Temperature at which the GPU will begin slowdown
+    // Keep this last
+    NVML_TEMPERATURE_THRESHOLD_COUNT
+} nvmlTemperatureThresholds_t;
+
+/** 
+ * Temperature sensors. 
+ */
+typedef enum nvmlTemperatureSensors_enum 
+{
+    NVML_TEMPERATURE_GPU      = 0,    //!< Temperature sensor for the GPU die
+    
+    // Keep this last
+    NVML_TEMPERATURE_COUNT
+} nvmlTemperatureSensors_t;
+
+/** 
+ * Compute mode. 
+ *
+ * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
+ * Earlier CUDA versions supported a single exclusive mode, 
+ * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
+ */
+typedef enum nvmlComputeMode_enum 
+{
+    NVML_COMPUTEMODE_DEFAULT           = 0,  //!< Default compute mode -- multiple contexts per device
+    NVML_COMPUTEMODE_EXCLUSIVE_THREAD  = 1,  //!< Support Removed
+    NVML_COMPUTEMODE_PROHIBITED        = 2,  //!< Compute-prohibited mode -- no contexts per device
+    NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,  //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
+    
+    // Keep this last
+    NVML_COMPUTEMODE_COUNT
+} nvmlComputeMode_t;
+
+/** 
+ * ECC bit types.
+ *
+ * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type
+ */
+#define nvmlEccBitType_t nvmlMemoryErrorType_t
+
+/**
+ * Single bit ECC errors
+ *
+ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED
+ */
+#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED
+
+/**
+ * Double bit ECC errors
+ *
+ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED
+ */
+#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED
+
+/**
+ * Memory error types
+ */
+typedef enum nvmlMemoryErrorType_enum
+{
+    /**
+     * A memory error that was corrected
+     * 
+     * For ECC errors, these are single bit errors
+     * For Texture memory, these are errors fixed by resend
+     */
+    NVML_MEMORY_ERROR_TYPE_CORRECTED = 0,
+    /**
+     * A memory error that was not corrected
+     * 
+     * For ECC errors, these are double bit errors
+     * For Texture memory, these are errors where the resend fails
+     */
+    NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1,
+    
+    
+    // Keep this last
+    NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types
+
+} nvmlMemoryErrorType_t;
+
+/** 
+ * ECC counter types. 
+ *
+ * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent.
+ *       On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver 
+ *       client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app
+ *       is run.
+ */
+typedef enum nvmlEccCounterType_enum 
+{
+    NVML_VOLATILE_ECC      = 0,      //!< Volatile counts are reset each time the driver loads.
+    NVML_AGGREGATE_ECC     = 1,      //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device)
+    
+    // Keep this last
+    NVML_ECC_COUNTER_TYPE_COUNT      //!< Count of memory counter types
+} nvmlEccCounterType_t;
+
+/** 
+ * Clock types. 
+ * 
+ * All speeds are in MHz.
+ */
+typedef enum nvmlClockType_enum 
+{
+    NVML_CLOCK_GRAPHICS  = 0,        //!< Graphics clock domain
+    NVML_CLOCK_SM        = 1,        //!< SM clock domain
+    NVML_CLOCK_MEM       = 2,        //!< Memory clock domain
+    NVML_CLOCK_VIDEO     = 3,        //!< Video encoder/decoder clock domain
+    
+    // Keep this last
+    NVML_CLOCK_COUNT //!< Count of clock types
+} nvmlClockType_t;
+
+/**
+ * Clock Ids.  These are used in combination with nvmlClockType_t
+ * to specify a single clock value.
+ */
+typedef enum nvmlClockId_enum
+{
+    NVML_CLOCK_ID_CURRENT            = 0,   //!< Current actual clock value
+    NVML_CLOCK_ID_APP_CLOCK_TARGET   = 1,   //!< Target application clock
+    NVML_CLOCK_ID_APP_CLOCK_DEFAULT  = 2,   //!< Default application clock target
+    NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3,   //!< OEM-defined maximum clock rate
+
+    //Keep this last
+    NVML_CLOCK_ID_COUNT //!< Count of Clock Ids.
+} nvmlClockId_t;
+
+/** 
+ * Driver models. 
+ *
+ * Windows only.
+ */
+typedef enum nvmlDriverModel_enum 
+{
+    NVML_DRIVER_WDDM      = 0,       //!< WDDM driver model -- GPU treated as a display device
+    NVML_DRIVER_WDM       = 1        //!< WDM (TCC) model (recommended) -- GPU treated as a generic device
+} nvmlDriverModel_t;
+
+/**
+ * Allowed PStates.
+ */
+typedef enum nvmlPStates_enum 
+{
+    NVML_PSTATE_0               = 0,       //!< Performance state 0 -- Maximum Performance
+    NVML_PSTATE_1               = 1,       //!< Performance state 1 
+    NVML_PSTATE_2               = 2,       //!< Performance state 2
+    NVML_PSTATE_3               = 3,       //!< Performance state 3
+    NVML_PSTATE_4               = 4,       //!< Performance state 4
+    NVML_PSTATE_5               = 5,       //!< Performance state 5
+    NVML_PSTATE_6               = 6,       //!< Performance state 6
+    NVML_PSTATE_7               = 7,       //!< Performance state 7
+    NVML_PSTATE_8               = 8,       //!< Performance state 8
+    NVML_PSTATE_9               = 9,       //!< Performance state 9
+    NVML_PSTATE_10              = 10,      //!< Performance state 10
+    NVML_PSTATE_11              = 11,      //!< Performance state 11
+    NVML_PSTATE_12              = 12,      //!< Performance state 12
+    NVML_PSTATE_13              = 13,      //!< Performance state 13
+    NVML_PSTATE_14              = 14,      //!< Performance state 14
+    NVML_PSTATE_15              = 15,      //!< Performance state 15 -- Minimum Performance 
+    NVML_PSTATE_UNKNOWN         = 32       //!< Unknown performance state
+} nvmlPstates_t;
+
+/**
+ * GPU Operation Mode
+ *
+ * GOM allows reducing power usage and optimizing GPU throughput by disabling GPU features.
+ *
+ * Each GOM is designed to meet specific user needs.
+ */
+typedef enum nvmlGom_enum
+{
+    NVML_GOM_ALL_ON                    = 0, //!< Everything is enabled and running at full speed
+
+    NVML_GOM_COMPUTE                   = 1, //!< Designed for running only compute tasks. Graphics operations
+                                            //!< are not allowed
+
+    NVML_GOM_LOW_DP                    = 2  //!< Designed for running graphics applications that don't require
+                                            //!< high bandwidth double precision
+} nvmlGpuOperationMode_t;
+
+/** 
+ * Available infoROM objects.
+ */
+typedef enum nvmlInforomObject_enum 
+{
+    NVML_INFOROM_OEM            = 0,       //!< An object defined by OEM
+    NVML_INFOROM_ECC            = 1,       //!< The ECC object determining the level of ECC support
+    NVML_INFOROM_POWER          = 2,       //!< The power management object
+
+    // Keep this last
+    NVML_INFOROM_COUNT                     //!< This counts the number of infoROM objects the driver knows about
+} nvmlInforomObject_t;
+
+/** 
+ * Return values for NVML API calls. 
+ */
+typedef enum nvmlReturn_enum 
+{
+    NVML_SUCCESS = 0,                   //!< The operation was successful
+    NVML_ERROR_UNINITIALIZED = 1,       //!< NVML was not first initialized with nvmlInit()
+    NVML_ERROR_INVALID_ARGUMENT = 2,    //!< A supplied argument is invalid
+    NVML_ERROR_NOT_SUPPORTED = 3,       //!< The requested operation is not available on target device
+    NVML_ERROR_NO_PERMISSION = 4,       //!< The current user does not have permission for operation
+    NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+    NVML_ERROR_NOT_FOUND = 6,           //!< A query to find an object was unsuccessful
+    NVML_ERROR_INSUFFICIENT_SIZE = 7,   //!< An input argument is not large enough
+    NVML_ERROR_INSUFFICIENT_POWER = 8,  //!< A device's external power cables are not properly attached
+    NVML_ERROR_DRIVER_NOT_LOADED = 9,   //!< NVIDIA driver is not loaded
+    NVML_ERROR_TIMEOUT = 10,            //!< User provided timeout passed
+    NVML_ERROR_IRQ_ISSUE = 11,          //!< NVIDIA Kernel detected an interrupt issue with a GPU
+    NVML_ERROR_LIBRARY_NOT_FOUND = 12,  //!< NVML Shared Library couldn't be found or loaded
+    NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+    NVML_ERROR_CORRUPTED_INFOROM = 14,  //!< infoROM is corrupted
+    NVML_ERROR_GPU_IS_LOST = 15,        //!< The GPU has fallen off the bus or has otherwise become inaccessible
+    NVML_ERROR_RESET_REQUIRED = 16,     //!< The GPU requires a reset before it can be used again
+    NVML_ERROR_OPERATING_SYSTEM = 17,   //!< The GPU control device has been blocked by the operating system/cgroups
+    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
+    NVML_ERROR_IN_USE = 19,             //!< An operation cannot be performed because the GPU is currently in use
+    NVML_ERROR_UNKNOWN = 999            //!< An internal driver error occurred
+} nvmlReturn_t;
+
+/**
+ * Memory locations
+ *
+ * See \ref nvmlDeviceGetMemoryErrorCounter
+ */
+typedef enum nvmlMemoryLocation_enum
+{
+    NVML_MEMORY_LOCATION_L1_CACHE = 0,       //!< GPU L1 Cache
+    NVML_MEMORY_LOCATION_L2_CACHE = 1,       //!< GPU L2 Cache
+    NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2,  //!< GPU Device Memory
+    NVML_MEMORY_LOCATION_REGISTER_FILE = 3,  //!< GPU Register File
+    NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory
+    
+    // Keep this last
+    NVML_MEMORY_LOCATION_COUNT              //!< This counts the number of memory locations the driver knows about
+} nvmlMemoryLocation_t;
+
+/**
+ * Causes for page retirement
+ */
+typedef enum nvmlPageRetirementCause_enum
+{
+    NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC errors
+    NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1,           //!< Page was retired due to a double bit ECC error
+
+    // Keep this last
+    NVML_PAGE_RETIREMENT_CAUSE_COUNT //!< Count of page retirement causes
+} nvmlPageRetirementCause_t;
+
+/**
+ * API types that allow changes to default permission restrictions
+ */
+typedef enum nvmlRestrictedAPI_enum
+{
+    NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0,   //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks 
+                                                      //!< and see nvmlDeviceResetApplicationsClocks
+    NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1,  //!< APIs that enable/disable auto boosted clocks,
+                                                      //!< see nvmlDeviceSetAutoBoostedClocksEnabled
+    // Keep this last
+    NVML_RESTRICTED_API_COUNT //!< Count of restricted API types
+} nvmlRestrictedAPI_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlUnitStructs Unit Structs
+ *  @{
+ */
+/***************************************************************************************************/
+
+typedef struct nvmlUnit_st* nvmlUnit_t;
+
+/** 
+ * Description of HWBC entry 
+ */
+typedef struct nvmlHwbcEntry_st 
+{
+    unsigned int hwbcId;
+    char firmwareVersion[32];
+} nvmlHwbcEntry_t;
+
+/** 
+ * Fan state enum. 
+ */
+typedef enum nvmlFanState_enum 
+{
+    NVML_FAN_NORMAL       = 0,     //!< Fan is working properly
+    NVML_FAN_FAILED       = 1      //!< Fan has failed
+} nvmlFanState_t;
+
+/** 
+ * Led color enum. 
+ */
+typedef enum nvmlLedColor_enum 
+{
+    NVML_LED_COLOR_GREEN       = 0,     //!< GREEN, indicates good health
+    NVML_LED_COLOR_AMBER       = 1      //!< AMBER, indicates problem
+} nvmlLedColor_t;
+
+
+/** 
+ * LED states for an S-class unit.
+ */
+typedef struct nvmlLedState_st 
+{
+    char cause[256];               //!< If amber, a text description of the cause
+    nvmlLedColor_t color;          //!< GREEN or AMBER
+} nvmlLedState_t;
+
+/** 
+ * Static S-class unit info.
+ */
+typedef struct nvmlUnitInfo_st 
+{
+    char name[96];                      //!< Product name
+    char id[96];                        //!< Product identifier
+    char serial[96];                    //!< Product serial number
+    char firmwareVersion[96];           //!< Firmware version
+} nvmlUnitInfo_t;
+
+/** 
+ * Power usage information for an S-class unit.
+ * The power supply state is a human readable string that equals "Normal" or contains
+ * a combination of "Abnormal" plus one or more of the following:
+ *    
+ *    - High voltage
+ *    - Fan failure
+ *    - Heatsink temperature
+ *    - Current limit
+ *    - Voltage below UV alarm threshold
+ *    - Low-voltage
+ *    - SI2C remote off command
+ *    - MOD_DISABLE input
+ *    - Short pin transition 
+ */
+typedef struct nvmlPSUInfo_st 
+{
+    char state[256];                 //!< The power supply state
+    unsigned int current;            //!< PSU current (A)
+    unsigned int voltage;            //!< PSU voltage (V)
+    unsigned int power;              //!< PSU power draw (W)
+} nvmlPSUInfo_t;
+
+/** 
+ * Fan speed reading for a single fan in an S-class unit.
+ */
+typedef struct nvmlUnitFanInfo_st 
+{
+    unsigned int speed;              //!< Fan speed (RPM)
+    nvmlFanState_t state;            //!< Flag that indicates whether fan is working properly
+} nvmlUnitFanInfo_t;
+
+/** 
+ * Fan speed readings for an entire S-class unit.
+ */
+typedef struct nvmlUnitFanSpeeds_st 
+{
+    nvmlUnitFanInfo_t fans[24];      //!< Fan speed data for each fan
+    unsigned int count;              //!< Number of fans in unit
+} nvmlUnitFanSpeeds_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @addtogroup nvmlEvents 
+ *  @{
+ */
+/***************************************************************************************************/
+
+/** 
+ * Handle to an event set
+ */
+typedef struct nvmlEventSet_st* nvmlEventSet_t;
+
+/** @defgroup nvmlEventType Event Types
+ * @{
+ * Event types that the user can be notified about.
+ * See description of particular functions for details.
+ *
+ * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices 
+ * support each event.
+ *
+ * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents
+ */
+//! Event about single bit ECC errors
+/**
+ * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event
+ */
+#define nvmlEventTypeSingleBitEccError     0x0000000000000001LL
+
+//! Event about double bit ECC errors
+/**
+ * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event
+ */
+#define nvmlEventTypeDoubleBitEccError     0x0000000000000002LL
+
+//! Event about PState changes
+/**
+ *  \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to
+ *  no work being executed on the GPU, power capping or thermal capping. In a typical situation,
+ *  Fermi-based GPU should stay in P0 for the duration of the execution of the compute process.
+ */
+#define nvmlEventTypePState                0x0000000000000004LL
+
+//! Event that Xid critical error occurred
+#define nvmlEventTypeXidCriticalError      0x0000000000000008LL
+
+//! Event about clock changes
+/**
+ * Kepler only
+ */
+#define nvmlEventTypeClock                 0x0000000000000010LL
+
+//! Mask with no events
+#define nvmlEventTypeNone                  0x0000000000000000LL
+//! Mask of all events
+#define nvmlEventTypeAll (nvmlEventTypeNone    \
+        | nvmlEventTypeSingleBitEccError       \
+        | nvmlEventTypeDoubleBitEccError       \
+        | nvmlEventTypePState                  \
+        | nvmlEventTypeClock                   \
+        | nvmlEventTypeXidCriticalError        \
+        )
+/** @} */
+
+/** 
+ * Information about an event that occurred
+ */
+typedef struct nvmlEventData_st
+{
+    nvmlDevice_t        device;         //!< Specific device where the event occurred
+    unsigned long long  eventType;      //!< Information about what specific event occurred
+    unsigned long long  eventData;      //!< Stores last XID error for the device in the event of nvmlEventTypeXidCriticalError, 
+                                        //!< eventData is 0 for any other event. eventData is set as 999 for unknown xid error.
+} nvmlEventData_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @addtogroup nvmlClocksThrottleReasons
+ *  @{
+ */
+/***************************************************************************************************/
+
+/** Nothing is running on the GPU and the clocks are dropping to Idle state
+ * \note This limiter may be removed in a later release
+ */
+#define nvmlClocksThrottleReasonGpuIdle                   0x0000000000000001LL
+
+/** GPU clocks are limited by current setting of applications clocks
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetApplicationsClock
+ */
+#define nvmlClocksThrottleReasonApplicationsClocksSetting   0x0000000000000002LL
+
+/** 
+ * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting 
+ *             as the name describes the situation more accurately.
+ */
+#define nvmlClocksThrottleReasonUserDefinedClocks         nvmlClocksThrottleReasonApplicationsClocksSetting 
+
+/** SW Power Scaling algorithm is reducing the clocks below requested clocks 
+ *
+ * @see nvmlDeviceGetPowerUsage
+ * @see nvmlDeviceSetPowerManagementLimit
+ * @see nvmlDeviceGetPowerManagementLimit
+ */
+#define nvmlClocksThrottleReasonSwPowerCap                0x0000000000000004LL
+
+/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
+ * 
+ * This is an indicator of:
+ *   - temperature being too high
+ *   - External Power Brake Assertion is triggered (e.g. by the system power supply)
+ *   - Power draw is too high and Fast Trigger protection is reducing the clocks
+ *   - May be also reported during PState or clock change
+ *      - This behavior may be removed in a later release.
+ *
+ * @see nvmlDeviceGetTemperature
+ * @see nvmlDeviceGetTemperatureThreshold
+ * @see nvmlDeviceGetPowerUsage
+ */
+#define nvmlClocksThrottleReasonHwSlowdown                0x0000000000000008LL
+
+/** Sync Boost
+ *
+ * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
+ * order to maximize performance per watt. All GPUs in the sync boost group
+ * will boost to the minimum possible clocks across the entire group. Look at
+ * the throttle reasons for other GPUs in the system to see why those GPUs are
+ * holding this one at lower clocks.
+ *
+ */
+#define nvmlClocksThrottleReasonSyncBoost                 0x0000000000000010LL
+
+/** Some other unspecified factor is reducing the clocks */
+#define nvmlClocksThrottleReasonUnknown                   0x8000000000000000LL
+
+/** Bit mask representing no clocks throttling
+ *
+ * Clocks are as high as possible.
+ * */
+#define nvmlClocksThrottleReasonNone                      0x0000000000000000LL
+
+/** Bit mask representing all supported clocks throttling reasons 
+ * New reasons might be added to this list in the future
+ */
+#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone \
+      | nvmlClocksThrottleReasonGpuIdle                           \
+      | nvmlClocksThrottleReasonApplicationsClocksSetting         \
+      | nvmlClocksThrottleReasonSwPowerCap                        \
+      | nvmlClocksThrottleReasonHwSlowdown                        \
+      | nvmlClocksThrottleReasonSyncBoost                         \
+      | nvmlClocksThrottleReasonUnknown                           \
+        ) 
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlAccountingStats Accounting Statistics
+ *  @{
+ *
+ *  Set of APIs designed to provide per process information about usage of GPU.
+ *
+ *  @note All accounting statistics and accounting mode live in nvidia driver and reset 
+ *        to default (Disabled) when driver unloads.
+ *        It is advised to run with persistence mode enabled.
+ *
+ *  @note Enabling accounting mode has no negative impact on the GPU performance.
+ */
+/***************************************************************************************************/
+
+/**
+ * Describes accounting statistics of a process.
+ */
+typedef struct nvmlAccountingStats_st {
+    unsigned int gpuUtilization;                //!< Percent of time over the process's lifetime during which one or more kernels were executing on the GPU.
+                                                //!< Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the lifetime of a
+                                                //!< process (not just the last sample period).
+                                                //!< Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported
+    
+    unsigned int memoryUtilization;             //!< Percent of time over the process's lifetime during which global (device) memory was being read or written.
+                                                //!< Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported
+    
+    unsigned long long maxMemoryUsage;          //!< Maximum total memory in bytes that was ever allocated by the process.
+                                                //!< Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported
+    
+
+    unsigned long long time;                    //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if 
+                                                //!< the process is not terminated
+    
+    unsigned long long startTime;               //!< CPU Timestamp in usec representing start time for the process
+    
+    unsigned int isRunning;                     //!< Flag to represent if the process is running (1 for running, 0 for terminated)
+
+    unsigned int reserved[5];                   //!< Reserved for future use
+} nvmlAccountingStats_t;
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup
+ * This chapter describes the methods that handle NVML initialization and cleanup.
+ * It is the user's responsibility to call \ref nvmlInit() before calling any other methods, and 
+ * nvmlShutdown() once NVML is no longer being used.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Initialize NVML, but don't initialize any GPUs yet.
+ *
+ * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that
+ *       did initialize all GPU devices in the system.
+ *       
+ * This allows NVML to communicate with a GPU
+ * when other GPUs in the system are unstable or in a bad state.  When using this API, GPUs are
+ * discovered and initialized in nvmlDeviceGetHandleBy* functions instead.
+ * 
+ * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in
+ *       a bad or unstable state.
+ * 
+ * For all products.
+ *
+ * This method should be called once before invoking any other methods in the library.
+ * A reference count of the number of initializations is maintained.  Shutdown only occurs
+ * when the reference count reaches zero.
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                   if NVML has been properly initialized
+ *         - \ref NVML_ERROR_DRIVER_NOT_LOADED   if NVIDIA driver is not running
+ *         - \ref NVML_ERROR_NO_PERMISSION       if NVML does not have permission to talk to the driver
+ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlInit(void);
+
+/**
+ * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit().
+ * 
+ * For all products.
+ *
+ * This method should be called after NVML work is done, once for each call to \ref nvmlInit()
+ * A reference count of the number of initializations is maintained.  Shutdown only occurs
+ * when the reference count reaches zero.  For backwards compatibility, no error is reported if
+ * nvmlShutdown() is called more times than nvmlInit().
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if NVML has been properly shut down
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlShutdown(void);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlErrorReporting Error reporting
+ * This chapter describes helper functions for error reporting routines.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Helper method for converting NVML error codes into readable strings.
+ *
+ * For all products.
+ *
+ * @param result                               NVML error code to convert
+ *
+ * @return String representation of the error.
+ *
+ */
+const DECLDIR char* nvmlErrorString(nvmlReturn_t result);
+/** @} */
+
+
+/***************************************************************************************************/
+/** @defgroup nvmlConstants Constants
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion
+ */
+#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE       16
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID
+ */
+#define NVML_DEVICE_UUID_BUFFER_SIZE                  80
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber
+ */
+#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE           80
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion
+ */
+#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE        80
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion
+ */
+#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE          80
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName
+ */
+#define NVML_DEVICE_NAME_BUFFER_SIZE                  64
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial
+ */
+#define NVML_DEVICE_SERIAL_BUFFER_SIZE                30
+
+/**
+ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion
+ */
+#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE         32
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlSystemQueries System Queries
+ * This chapter describes the queries that NVML can perform against the local system. These queries
+ * are not device-specific.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieves the version of the system's graphics driver.
+ * 
+ * For all products.
+ *
+ * The version identifier is an alphanumeric string.  It will not exceed 80 characters in length
+ * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
+ *
+ * @param version                              Reference in which to return the version identifier
+ * @param length                               The maximum allowed length of the string returned in \a version
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length);
+
+/**
+ * Retrieves the version of the NVML library.
+ * 
+ * For all products.
+ *
+ * The version identifier is an alphanumeric string.  It will not exceed 80 characters in length
+ * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE.
+ *
+ * @param version                              Reference in which to return the version identifier
+ * @param length                               The maximum allowed length of the string returned in \a version
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length);
+
+/**
+ * Gets name of the process with provided process id
+ *
+ * For all products.
+ *
+ * Returned process name is cropped to provided length.
+ * The name string is encoded in ANSI.
+ *
+ * @param pid                                  The identifier of the process
+ * @param name                                 Reference in which to return the process name
+ * @param length                               The maximum allowed length of the string returned in \a name
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a name has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a name is NULL or \a length is 0.
+ *         - \ref NVML_ERROR_NOT_FOUND         if process doesn't exist
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlUnitQueries Unit Queries
+ * This chapter describes the queries that NVML can perform against each unit. For S-class systems only.
+ * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by 
+ * calling \ref nvmlUnitGetHandleByIndex().
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieves the number of units in the system.
+ *
+ * For S-class products.
+ *
+ * @param unitCount                            Reference in which to return the number of units
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a unitCount has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unitCount is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount);
+
+/**
+ * Acquire the handle for a particular unit, based on its index.
+ *
+ * For S-class products.
+ *
+ * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). 
+ *   For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1.
+ *
+ * The order in which NVML enumerates units has no guarantees of consistency between reboots.
+ *
+ * @param index                                The index of the target unit, >= 0 and < \a unitCount
+ * @param unit                                 Reference in which to return the unit handle
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a unit has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a index is invalid or \a unit is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit);
+
+/**
+ * Retrieves the static information associated with a unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlUnitInfo_t for details on available unit info.
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param info                                 Reference in which to return the unit information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a info has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a info is NULL
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info);
+
+/**
+ * Retrieves the LED state associated with this unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlLedState_t for details on allowed states.
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param state                                Reference in which to return the current LED state
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a state has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a state is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlUnitSetLedState()
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state);
+
+/**
+ * Retrieves the PSU stats for the unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlPSUInfo_t for details on available PSU info.
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param psu                                  Reference in which to return the PSU information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a psu has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a psu is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu);
+
+/**
+ * Retrieves the temperature readings for the unit, in degrees C.
+ *
+ * For S-class products.
+ *
+ * Depending on the product, readings may be available for intake (type=0), 
+ * exhaust (type=1) and board (type=2).
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param type                                 The type of reading to take
+ * @param temp                                 Reference in which to return the temperature reading selected by \a type
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a temp has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit or \a type is invalid or \a temp is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp);
+
+/**
+ * Retrieves the fan speed readings for the unit.
+ *
+ * For S-class products.
+ *
+ * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info.
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param fanSpeeds                            Reference in which to return the fan speed information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a fanSpeeds has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a fanSpeeds is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds);
+
+/**
+ * Retrieves the set of GPU devices that are attached to the specified unit.
+ *
+ * For S-class products.
+ *
+ * The \a deviceCount argument is expected to be set to the size of the input \a devices array.
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param deviceCount                          Reference in which to provide the \a devices array size, and
+ *                                             to return the number of attached GPU devices
+ * @param devices                              Reference in which to return the references to the attached GPU devices
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a deviceCount and \a devices have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid, or if either \a deviceCount or \a devices is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices);
+
+/**
+ * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
+ * 
+ * For S-class products.
+ *
+ * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array.
+ * The HIC must be connected to an S-class system for it to be reported by this function.
+ *
+ * @param hwbcCount                            Size of hwbcEntries array
+ * @param hwbcEntries                          Array holding information about hwbc
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a hwbcCount and \a hwbcEntries have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if either \a hwbcCount or \a hwbcEntries is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries);
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceQueries Device Queries
+ * This chapter describes the queries that NVML can perform against each device.
+ * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by  
+ * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(),
+ * \ref nvmlDeviceGetHandleByPciBusId(), or \ref nvmlDeviceGetHandleByUUID().
+ *  @{
+ */
+/***************************************************************************************************/
+
+ /**
+ * Retrieves the number of compute devices in the system. A compute device is a single GPU.
+ * 
+ * For all products.
+ *
+ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
+ *       even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
+ *       Update your code to handle this error, or use NVML 4.304 or older nvml header file.
+ *       For backward binary compatibility reasons _v1 version of the API is still present in the shared
+ *       library.
+ *       Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
+ *
+ * @param deviceCount                          Reference in which to return the number of accessible devices
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a deviceCount has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a deviceCount is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount);
+
+/**
+ * Acquire the handle for a particular device, based on its index.
+ * 
+ * For all products.
+ *
+ * Valid indices are derived from the \a accessibleDevices count returned by 
+ *   \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices  
+ *   are 0 and 1, corresponding to GPU 0 and GPU 1.
+ *
+ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
+ *   is recommended that devices be looked up by their PCI ids or UUID. See 
+ *   \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId().
+ *
+ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs if:
+ *  - The target GPU is an SLI slave
+ * 
+ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
+ *       even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
+ *       Update your code to handle this error, or use NVML 4.304 or older nvml header file.
+ *       For backward binary compatibility reasons _v1 version of the API is still present in the shared
+ *       library.
+ *       Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
+ *
+ *       This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index.
+ *       If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't
+ *       need to worry about that.
+ *
+ * @param index                                The index of the target GPU, >= 0 and < \a accessibleDevices
+ * @param device                               Reference in which to return the device handle
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                  if \a device has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a index is invalid or \a device is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
+ *         - \ref NVML_ERROR_NO_PERMISSION      if the user doesn't have permission to talk to this device
+ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
+ *
+ * @see nvmlDeviceGetIndex
+ * @see nvmlDeviceGetCount
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+
+/**
+ * Acquire the handle for a particular device, based on its board serial number.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * This number corresponds to the value printed directly on the board, and to the value returned by
+ *   \ref nvmlDeviceGetSerial().
+ *
+ * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor 
+ *             of \ref nvmlDeviceGetHandleByUUID.
+ *             For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT.
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs as it searches for the target GPU
+ *
+ * @param serial                               The board serial number of the target GPU
+ * @param device                               Reference in which to return the device handle
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                  if \a device has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a serial is invalid, \a device is NULL or more than one
+ *                                              device has the same serial (dual GPU boards)
+ *         - \ref NVML_ERROR_NOT_FOUND          if \a serial does not match a valid device on the system
+ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
+ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ *         - \ref NVML_ERROR_GPU_IS_LOST        if any GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
+ *
+ * @see nvmlDeviceGetSerial
+ * @see nvmlDeviceGetHandleByUUID
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device);
+
+/**
+ * Acquire the handle for a particular device, based on the globally unique immutable UUID associated with that device.
+ *
+ * For all products.
+ *
+ * @param uuid                                 The UUID of the target GPU
+ * @param device                               Reference in which to return the device handle
+ * 
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs as it searches for the target GPU
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                  if \a device has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a uuid is invalid or \a device is NULL
+ *         - \ref NVML_ERROR_NOT_FOUND          if \a uuid does not match a valid device on the system
+ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
+ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ *         - \ref NVML_ERROR_GPU_IS_LOST        if any GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
+ *
+ * @see nvmlDeviceGetUUID
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device);
+
+/**
+ * Acquire the handle for a particular device, based on its PCI bus id.
+ * 
+ * For all products.
+ *
+ * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo().
+ *
+ * Starting from NVML 5, this API causes NVML to initialize the target GPU.
+ * NVML may initialize additional GPUs if:
+ *  - The target GPU is an SLI slave
+ *
+ * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND 
+ *       instead of NVML_ERROR_NO_PERMISSION.
+ *
+ * @param pciBusId                             The PCI bus id of the target GPU
+ * @param device                               Reference in which to return the device handle
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                  if \a device has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a pciBusId is invalid or \a device is NULL
+ *         - \ref NVML_ERROR_NOT_FOUND          if \a pciBusId does not match a valid device on the system
+ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables
+ *         - \ref NVML_ERROR_NO_PERMISSION      if the user doesn't have permission to talk to this device
+ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
+ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device);
+
+/**
+ * Retrieves the name of this device. 
+ * 
+ * For all products.
+ *
+ * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not
+ * exceed 64 characters in length (including the NULL terminator).  See \ref
+ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
+ *
+ * @param device                               The identifier of the target device
+ * @param name                                 Reference in which to return the product name
+ * @param length                               The maximum allowed length of the string returned in \a name
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a name has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a name is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length);
+
+/**
+ * Retrieves the brand of this device.
+ *
+ * For all products.
+ *
+ * The type is a member of \ref nvmlBrandType_t defined above.
+ *
+ * @param device                               The identifier of the target device
+ * @param type                                 Reference in which to return the product brand type
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a type has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a type is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type);
+
+/**
+ * Retrieves the NVML index of this device.
+ *
+ * For all products.
+ * 
+ * Valid indices are derived from the \a accessibleDevices count returned by 
+ *   \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices  
+ *   are 0 and 1, corresponding to GPU 0 and GPU 1.
+ *
+ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
+ *   is recommended that devices be looked up by their PCI ids or GPU UUID. See 
+ *   \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID().
+ *
+ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
+ *
+ * @param device                               The identifier of the target device
+ * @param index                                Reference in which to return the NVML index of the device
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a index has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a index is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetHandleByIndex()
+ * @see nvmlDeviceGetCount()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index);
+
+/**
+ * Retrieves the globally unique board serial number associated with this device's board.
+ *
+ * For all products with an inforom.
+ *
+ * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator).
+ * This number matches the serial number tag that is physically attached to the board.  See \ref
+ * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE.
+ *
+ * @param device                               The identifier of the target device
+ * @param serial                               Reference in which to return the board/module serial number
+ * @param length                               The maximum allowed length of the string returned in \a serial
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a serial has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a serial is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length);
+
+/**
+ * Retrieves an array of unsigned longs (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device.
+ * For example, with 32 CPUs per array element, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
+ *     result[0] = 0x3, result[1] = 0x3
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device                               The identifier of the target device
+ * @param cpuSetSize                           The size of the cpuSet array that is safe to access
+ * @param cpuSet                               Array reference in which to return a bitmask of CPUs, 64 CPUs per 
+ *                                                 unsigned long on 64-bit machines, 32 on 32-bit machines
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a cpuSet has been filled
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet);
+
+/**
+ * Sets the ideal affinity for the calling thread and device using the guidelines 
+ * given in nvmlDeviceGetCpuAffinity().  Note, this is a change as of version 8.0.  
+ * Older versions set the affinity for a calling process and all children.
+ * Currently supports up to 64 processors.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device                               The identifier of the target device
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the calling thread has been successfully bound
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device);
+
+/**
+ * Clear all affinity bindings for the calling thread.  Note, this is a change as of version
+ * 8.0 as older versions cleared the affinity for a calling process and all children.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device                               The identifier of the target device
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the calling thread has been successfully unbound
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device);
+
+/**
+ * Retrieve the common ancestor for two devices.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param device1                              The identifier of the first device
+ * @param device2                              The identifier of the second device
+ * @param pathInfo                             A \ref nvmlGpuTopologyLevel_t that gives the path type
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a pathInfo has been set
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device1, or \a device2 is invalid, or \a pathInfo is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo);
+
+/**
+ * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param device                               The identifier of the target device
+ * @param level                                The \ref nvmlGpuTopologyLevel_t level to search for other GPUs
+ * @param count                                When zero, is set to the number of matching GPUs such that \a deviceArray 
+ *                                             can be malloc'd.  When non-zero, \a deviceArray will be filled with \a count
+ *                                             number of device handles.
+ * @param deviceArray                          An array of device handles for GPUs found at \a level
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a deviceArray or \a count (if initially zero) has been set
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray);
+
+/**
+ * Retrieve the set of GPUs that have a CPU affinity with the given CPU number.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param cpuNumber                            The CPU number
+ * @param count                                When zero, is set to the number of matching GPUs such that \a deviceArray 
+ *                                             can be malloc'd.  When non-zero, \a deviceArray will be filled with \a count
+ *                                             number of device handles.
+ * @param deviceArray                          An array of device handles for GPUs found with affinity to \a cpuNumber
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a deviceArray or \a count (if initially zero) has been set
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray);
+
+/**
+ * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string,
+ * that augments the immutable, board serial identifier.
+ *
+ * For all products.
+ *
+ * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products.
+ * It does NOT correspond to any identifier printed on the board.  It will not exceed 80 characters in length
+ * (including the NULL terminator).  See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * @param device                               The identifier of the target device
+ * @param uuid                                 Reference in which to return the GPU UUID
+ * @param length                               The maximum allowed length of the string returned in \a uuid
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a uuid has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a uuid is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);
+
+/**
+ * Retrieves the minor number for the device. The minor number for the device is such that the Nvidia device node file for
+ * each GPU will have the form /dev/nvidia[minor number].
+ *
+ * For all products.
+ * Supported only for Linux
+ *
+ * @param device                                The identifier of the target device
+ * @param minorNumber                           Reference in which to return the minor number for the device
+ * @return
+ *         - \ref NVML_SUCCESS                 if the minor number is successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minorNumber is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber);
+
+/**
+ * Retrieves the device board part number which is programmed into the board's InfoROM
+ *
+ * For all products.
+ *
+ * @param device                                Identifier of the target device
+ * @param partNumber                            Reference to the buffer to return
+ * @param length                                Length of the buffer reference
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                  if \a partNumber has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_NOT_SUPPORTED      if the needed VBIOS fields have not been filled
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid or \a partNumber is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length);
+
+/**
+ * Retrieves the version information for the device's infoROM object.
+ *
+ * For all products with an inforom.
+ *
+ * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate 
+ * ECC counts. The version of the data structures in this memory may change from time to time. It will not
+ * exceed 16 characters in length (including the NULL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
+ *
+ * See \ref nvmlInforomObject_t for details on the available infoROM objects.
+ *
+ * @param device                               The identifier of the target device
+ * @param object                               The target infoROM object
+ * @param version                              Reference in which to return the infoROM version
+ * @param length                               The maximum allowed length of the string returned in \a version
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetInforomImageVersion
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length);
+
+/**
+ * Retrieves the global infoROM image version
+ *
+ * For all products with an inforom.
+ *
+ * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board 
+ * in contrast to infoROM object version which is only an indicator of supported features.
+ * Version string will not exceed 16 characters in length (including the NULL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
+ *
+ * @param device                               The identifier of the target device
+ * @param version                              Reference in which to return the infoROM image version
+ * @param length                               The maximum allowed length of the string returned in \a version
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetInforomVersion
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length);
+
+/**
+ * Retrieves the checksum of the configuration stored in the device's infoROM.
+ *
+ * For all products with an inforom.
+ *
+ * Can be used to make sure that two GPUs have the exact same configuration.
+ * Current checksum takes into account configuration stored in PWR and ECC infoROM objects.
+ * Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC)
+ *
+ * @param device                               The identifier of the target device
+ * @param checksum                             Reference in which to return the infoROM configuration checksum
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a checksum has been set
+ *         - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a checksum is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error 
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum);
+
+/**
+ * Reads the infoROM from the flash and verifies the checksums.
+ *
+ * For all products with an inforom.
+ *
+ * @param device                               The identifier of the target device
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if infoROM is not corrupted
+ *         - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error 
+ */
+nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device);
+
+/**
+ * Retrieves the display mode for the device.
+ *
+ * For all products.
+ *
+ * This method indicates whether a physical display (e.g. monitor) is currently connected to
+ * any of the device's connectors.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param display                              Reference in which to return the display mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a display has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a display is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display);
+
+/**
+ * Retrieves the display active state for the device.
+ *
+ * For all products.
+ *
+ * This method indicates whether a display is initialized on the device.
+ * For example whether X Server is attached to this device and has allocated memory for the screen.
+ *
+ * Display can be active even when no monitor is physically attached.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param isActive                             Reference in which to return the display active state
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a isActive has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isActive is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive);
+
+/**
+ * Retrieves the persistence mode associated with this device.
+ *
+ * For all products.
+ * For Linux only.
+ *
+ * When driver persistence mode is enabled the driver software state is not torn down when the last 
+ * client disconnects. By default this feature is disabled. 
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Reference in which to return the current driver persistence mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetPersistenceMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Retrieves the PCI attributes of this device.
+ * 
+ * For all products.
+ *
+ * See \ref nvmlPciInfo_t for details on the available PCI info.
+ *
+ * @param device                               The identifier of the target device
+ * @param pci                                  Reference in which to return the PCI info
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a pci has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pci is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
+
+/**
+ * Retrieves the maximum PCIe link generation possible with this device and system
+ *
+ * E.g. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will
+ * report is generation 1.
+ * 
+ * For Fermi &tm; or newer fully supported devices.
+ * 
+ * @param device                               The identifier of the target device
+ * @param maxLinkGen                           Reference in which to return the max PCIe link generation
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a maxLinkGen has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkGen is null
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen);
+
+/**
+ * Retrieves the maximum PCIe link width possible with this device and system
+ *
+ * E.g. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report
+ * a max link width of 8.
+ * 
+ * For Fermi &tm; or newer fully supported devices.
+ * 
+ * @param device                               The identifier of the target device
+ * @param maxLinkWidth                         Reference in which to return the max PCIe link width
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a maxLinkWidth has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkWidth is null
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth);
+
+/**
+ * Retrieves the current PCIe link generation
+ * 
+ * For Fermi &tm; or newer fully supported devices.
+ * 
+ * @param device                               The identifier of the target device
+ * @param currLinkGen                          Reference in which to return the current PCIe link generation
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a currLinkGen has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a currLinkGen is null
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen);
+
+/**
+ * Retrieves the current PCIe link width
+ * 
+ * For Fermi &tm; or newer fully supported devices.
+ * 
+ * @param device                               The identifier of the target device
+ * @param currLinkWidth                        Reference in which to return the current PCIe link width
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a currLinkWidth has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a currLinkWidth is null
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth);
+
+/**
+ * Retrieve PCIe utilization information.
+ * This function queries a byte counter over a 20ms interval, and thus the value returned is the
+ *   PCIe throughput over that interval.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * This method is not supported on virtualized GPU environments.
+ *
+ * @param device                               The identifier of the target device
+ * @param counter                              The specific counter that should be queried \ref nvmlPcieUtilCounter_t
+ * @param value                                Reference in which to return throughput in KB/s
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a value has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a counter is invalid, or \a value is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value);
+
+/**  
+ * Retrieve the PCIe replay counter.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param value                                Reference in which to return the counter's value
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a value has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a value is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value);
+
+/**
+ * Retrieves the current clock speeds for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlClockType_t for details on available clock information.
+ *
+ * @param device                               The identifier of the target device
+ * @param type                                 Identify which clock domain to query
+ * @param clock                                Reference in which to return the clock speed in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a clock has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device cannot report the specified clock
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+
+/**
+ * Retrieves the maximum clock speeds for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlClockType_t for details on available clock information.
+ *
+ * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks
+ *       by few MHz.
+ *
+ * @param device                               The identifier of the target device
+ * @param type                                 Identify which clock domain to query
+ * @param clock                                Reference in which to return the clock speed in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a clock has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device cannot report the specified clock
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+
+/**
+ * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs.
+ * Can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param clockType                            Identify which clock domain to query
+ * @param clockMHz                             Reference in which to return the clock in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Retrieves the default applications clock that GPU boots with or 
+ * defaults to after \ref nvmlDeviceResetApplicationsClocks call.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param clockType                            Identify which clock domain to query
+ * @param clockMHz                             Reference in which to return the default clock in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * \see nvmlDeviceGetApplicationsClock
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Resets the application clock to the default value
+ *
+ * This is the applications clock that will be used after system reboot or driver reload.
+ * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * @see nvmlDeviceGetApplicationsClock
+ * @see nvmlDeviceSetApplicationsClocks
+ *
+ * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ *
+ * @param device                               The identifier of the target device
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if new settings were successfully set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device);
+
+/**
+ * Retrieves the clock speed for the clock specified by the clock type and clock ID.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param clockType                            Identify which clock domain to query
+ * @param clockId                              Identify which clock in the domain to query
+ * @param clockMHz                             Reference in which to return the clock in MHz
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz);
+
+/**
+ * Retrieves the customer defined maximum boost clock speed specified by the given clock type.
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param clockType                            Identify which clock domain to query
+ * @param clockMHz                             Reference in which to return the clock in MHz
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or the \a clockType on this device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param count                                Reference in which to provide the \a clocksMHz array size, and
+ *                                             to return the number of elements
+ * @param clocksMHz                            Reference in which to return the clocks in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a count and \a clocksMHz have been populated 
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of
+ *                                                required elements)
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetSupportedGraphicsClocks
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz);
+
+/**
+ * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param memoryClockMHz                       Memory clock for which to return possible graphics clocks
+ * @param count                                Reference in which to provide the \a clocksMHz array size, and
+ *                                             to return the number of elements
+ * @param clocksMHz                            Reference in which to return the clocks in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a count and \a clocksMHz have been populated 
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_NOT_FOUND         if the specified \a memoryClockMHz is not a supported frequency
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small 
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetSupportedMemoryClocks
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz);
+
+/**
+ * Retrieve the current state of auto boosted clocks on a device and store it in \a isEnabled
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Auto boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow.
+ *
+ * @param device                               The identifier of the target device
+ * @param isEnabled                            Where to store the current state of auto boosted clocks of the target device
+ * @param defaultIsEnabled                     Where to store the default auto boosted clocks behavior of the target device that the device will
+ *                                                 revert to when no applications are using the GPU
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 If \a isEnabled has been set with the auto boosted clocks state of \a device
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isEnabled is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support auto boosted clocks
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled);
+
+/**
+ * Try to set the current state of auto boosted clocks on a device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Auto boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow. Auto boosted clocks should be disabled if fixed clock
+ * rates are desired.
+ * Non-root users may use this API by default but can be restricted by root from using this API by calling
+ * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS.
+ * Note: Persistence Mode is required to modify current Auto boost settings, therefore, it must be enabled.
+ *
+ * @param device                               The identifier of the target device
+ * @param enabled                              What state to try to set auto boosted clocks of the target device to
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 If the auto boosted clocks were successfully set to the state specified by \a enabled
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support auto boosted clocks
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled);
+
+/**
+ * Try to set the default state of auto boosted clocks on a device. This is the default state that auto boosted clocks will
+ * return to when no compute processes (e.g. CUDA applications which have an active context) are running.
+ *
+ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * Requires root/admin permissions.
+ *
+ * Auto boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow. Auto boosted clocks should be disabled if fixed clock
+ * rates are desired.
+ *
+ * @param device                               The identifier of the target device
+ * @param enabled                              What state to try to set default auto boosted clocks of the target device to
+ * @param flags                                Flags that change the default behavior. Currently Unused.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 If the auto boosted clock's default state was successfully set to the state specified by \a enabled
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_NO_PERMISSION     If the calling user does not have permission to change auto boosted clock's default state.
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support auto boosted clocks
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags);
+
+
+/**
+ * Retrieves the intended operating speed of the device's fan.
+ *
+ * Note: The reported speed is the intended fan speed.  If the fan is physically blocked and unable to spin, the
+ * output will not match the actual fan speed.
+ * 
+ * For all discrete products with dedicated fans.
+ *
+ * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%.
+ *
+ * @param device                               The identifier of the target device
+ * @param speed                                Reference in which to return the fan speed percentage
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a speed has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a speed is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a fan
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed);
+
+/**
+ * Retrieves the current temperature readings for the device, in degrees C. 
+ * 
+ * For all products.
+ *
+ * See \ref nvmlTemperatureSensors_t for details on available temperature sensors.
+ *
+ * @param device                               The identifier of the target device
+ * @param sensorType                           Flag that indicates which sensor reading to retrieve
+ * @param temp                                 Reference in which to return the temperature reading
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a temp has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a sensorType is invalid or \a temp is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have the specified sensor
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp);
+
+/**
+ * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
+ *
+ * @param device                               The identifier of the target device
+ * @param thresholdType                        The type of threshold value queried
+ * @param temp                                 Reference in which to return the temperature threshold
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a temp has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a temperature sensor or is unsupported
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
+
+/**
+ * Retrieves the current performance state for the device. 
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlPstates_t for details on allowed performance states.
+ *
+ * @param device                               The identifier of the target device
+ * @param pState                               Reference in which to return the performance state reading
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a pState has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pState is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState);
+
+/**
+ * Retrieves current clocks throttling reasons.
+ *
+ * For all fully supported products.
+ *
+ * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once.
+ *
+ * @param device                                The identifier of the target device
+ * @param clocksThrottleReasons                 Reference in which to return bitmask of active clocks throttle
+ *                                                  reasons
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a clocksThrottleReasons has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clocksThrottleReasons is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlClocksThrottleReasons
+ * @see nvmlDeviceGetSupportedClocksThrottleReasons
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
+
+/**
+ * Retrieves bitmask of supported clocks throttle reasons that can be returned by 
+ * \ref nvmlDeviceGetCurrentClocksThrottleReasons
+ *
+ * For all fully supported products.
+ *
+ * This method is not supported on virtualized GPU environments.
+ *
+ * @param device                               The identifier of the target device
+ * @param supportedClocksThrottleReasons       Reference in which to return bitmask of supported
+ *                                              clocks throttle reasons
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a supportedClocksThrottleReasons has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a supportedClocksThrottleReasons is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlClocksThrottleReasons
+ * @see nvmlDeviceGetCurrentClocksThrottleReasons
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
+
+/**
+ * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization.
+ *
+ * Retrieve the current performance state for the device. 
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlPstates_t for details on allowed performance states.
+ *
+ * @param device                               The identifier of the target device
+ * @param pState                               Reference in which to return the performance state reading
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a pState has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pState is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState);
+
+/**
+ * This API has been deprecated.
+ *
+ * Retrieves the power management mode associated with this device.
+ *
+ * For products from the Fermi family.
+ *     - Requires \a NVML_INFOROM_POWER version 3.0 or higher.
+ *
+ * For products from the Kepler or newer families.
+ *     - Does not require \a NVML_INFOROM_POWER object.
+ *
+ * This flag indicates whether any power management algorithm is currently active on the device. An 
+ * enabled state does not necessarily mean the device is being actively throttled -- only that 
+ * the driver will do so if the appropriate conditions are met.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Reference in which to return the current power management mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Retrieves the power management limit associated with this device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * The power limit defines the upper boundary for the card's power draw. If
+ * the card's total power draw reaches this limit the power management algorithm kicks in.
+ *
+ * This reading is only available if power management mode is supported. 
+ * See \ref nvmlDeviceGetPowerManagementMode.
+ *
+ * @param device                               The identifier of the target device
+ * @param limit                                Reference in which to return the power management limit in milliwatts
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a limit has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit);
+
+/**
+ * Retrieves information about possible values of power management limits on this device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param minLimit                             Reference in which to return the minimum power management limit in milliwatts
+ * @param maxLimit                             Reference in which to return the maximum power management limit in milliwatts
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a minLimit and \a maxLimit have been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minLimit or \a maxLimit is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetPowerManagementLimit
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
+
+/**
+ * Retrieves default power management limit on this device, in milliwatts.
+ * Default power management limit is a power management limit that the device boots with.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param defaultLimit                         Reference in which to return the default power management limit in milliwatts
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a defaultLimit has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a defaultLimit is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit);
+
+/**
+ * Retrieves power usage for this GPU and its associated circuitry (e.g. memory), in milliwatts.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
+ *
+ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode.
+ *
+ * @param device                               The identifier of the target device
+ * @param power                                Reference in which to return the power usage information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a power has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a power is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support power readings
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power);
+
+/**
+ * Get the effective power limit that the driver enforces after taking into account all limiters
+ *
+ * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere
+ * This includes the out of band power limit interface
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                           The device to communicate with
+ * @param limit                            Reference in which to return the power management limit in milliwatts
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a limit has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit);
+
+/**
+ * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot).
+ *
+ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
+ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
+ * Not supported on Quadro &reg; and Tesla &tm; C-class products.
+ *
+ * @param device                               The identifier of the target device
+ * @param current                              Reference in which to return the current GOM
+ * @param pending                              Reference in which to return the pending GOM
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a current and \a pending have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a current or \a pending is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlGpuOperationMode_t
+ * @see nvmlDeviceSetGpuOperationMode
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending);
+
+/**
+ * Retrieves the amount of used, free and total memory available on the device, in bytes.
+ * 
+ * For all products.
+ *
+ * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
+ * Under WDDM most device memory is allocated and managed on startup by Windows.
+ *
+ * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated 
+ * by all active channels on the device.
+ *
+ * See \ref nvmlMemory_t for details on available memory info.
+ *
+ * @param device                               The identifier of the target device
+ * @param memory                               Reference in which to return the memory information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a memory has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memory is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
+
+/**
+ * Retrieves the current compute mode for the device.
+ *
+ * For all products.
+ *
+ * See \ref nvmlComputeMode_t for details on allowed compute modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Reference in which to return the current compute mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetComputeMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode);
+
+/**
+ * Retrieves the current and pending ECC modes for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ *
+ * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following
+ * the next reboot.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param current                              Reference in which to return the current ECC mode
+ * @param pending                              Reference in which to return the pending ECC mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a current and \a pending have been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or either \a current or \a pending is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetEccMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending);
+
+/**
+ * Retrieves the device boardId from 0-N.
+ * Devices with the same boardId indicate GPUs connected to the same PLX.  Use in conjunction with 
+ *  \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well.
+ *  The boardId returned is a unique ID for the current configuration.  Uniqueness and ordering across 
+ *  reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and
+ *  the two GPUs on a Tesla K10 in the same system return 0x200, it is not guaranteed they will
+ *  always return those values but they will always be different from each other).
+ *  
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param boardId                              Reference in which to return the device's board ID
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a boardId has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a boardId is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId);
+
+/**
+ * Retrieves whether the device is on a Multi-GPU Board
+ * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param multiGpuBool                         Reference in which to return a zero or non-zero value
+ *                                                 to indicate whether the device is on a multi GPU board
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a multiGpuBool has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a multiGpuBool is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool);
+
+/**
+ * Retrieves the total ECC error counts for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ * Requires ECC Mode to be enabled.
+ *
+ * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of 
+ * errors across the entire device.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.
+ *
+ * @param device                               The identifier of the target device
+ * @param errorType                            Flag that specifies the type of the errors. 
+ * @param counterType                          Flag that specifies the counter-type of the errors. 
+ * @param eccCounts                            Reference in which to return the specified ECC errors
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a eccCounts has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceClearEccErrorCounts()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts);
+
+/**
+ * Retrieves the detailed ECC error counts for the device.
+ *
+ * @deprecated   This API supports only a fixed set of ECC error locations
+ *               On different GPU architectures different locations are supported
+ *               See \ref nvmlDeviceGetMemoryErrorCounter
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts.
+ * Requires ECC Mode to be enabled.
+ *
+ * Detailed errors provide separate ECC counts for specific parts of the memory system.
+ *
+ * Reports zero for unsupported ECC error counters when only a subset of ECC error counters is supported.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
+ * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts.
+ *
+ * @param device                               The identifier of the target device
+ * @param errorType                            Flag that specifies the type of the errors. 
+ * @param counterType                          Flag that specifies the counter-type of the errors. 
+ * @param eccCounts                            Reference in which to return the specified ECC errors
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a eccCounts has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceClearEccErrorCounts()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts);
+
+/**
+ * Retrieves the requested memory error counter for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts.
+ *
+ * Only applicable to devices with ECC.
+ *
+ * Requires ECC Mode to be enabled.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
+ * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n
+ * 
+ * @param device                               The identifier of the target device
+ * @param errorType                            Flag that specifies the type of error.
+ * @param counterType                          Flag that specifies the counter-type of the errors. 
+ * @param locationType                         Specifies the location of the counter. 
+ * @param count                                Reference in which to return the ECC counter
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a count has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType, \a counterType or \a locationType is
+ *                                             invalid, or \a count is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support ECC error reporting in the specified memory
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
+                                                   nvmlEccCounterType_t counterType,
+                                                   nvmlMemoryLocation_t locationType, unsigned long long *count);
+
+/**
+ * Retrieves the current utilization rates for the device's major subsystems.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlUtilization_t for details on available utilization rates.
+ *
+ * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
+ *       This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
+ *
+ * @param device                               The identifier of the target device
+ * @param utilization                          Reference in which to return the utilization information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a utilization is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
+
+/**
+ * Retrieves the current utilization and sampling size in microseconds for the Encoder
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param utilization                          Reference to an unsigned int for encoder utilization info
+ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in microseconds
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
+
+/**
+ * Retrieves the current utilization and sampling size in microseconds for the Decoder
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param utilization                          Reference to an unsigned int for decoder utilization info
+ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in microseconds
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
+
+/**
+ * Retrieves the current and pending driver model for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * For Windows only.
+ *
+ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
+ * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached.
+ *
+ * See \ref nvmlDriverModel_t for details on available driver models.
+ *
+ * @param device                               The identifier of the target device
+ * @param current                              Reference in which to return the current driver model
+ * @param pending                              Reference in which to return the pending driver model
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a current and/or \a pending have been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or both \a current and \a pending are NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not Windows
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlDeviceSetDriverModel()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
+
+/**
+ * Get VBIOS version of the device.
+ *
+ * For all products.
+ *
+ * The VBIOS version may change from time to time. It will not exceed 32 characters in length 
+ * (including the NULL terminator).  See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE.
+ *
+ * @param device                               The identifier of the target device
+ * @param version                              Reference to which to return the VBIOS version
+ * @param length                               The maximum allowed length of the string returned in \a version
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a version is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length);
+
+/**
+ * Get Bridge Chip Information for all the bridge chips on the board.
+ * 
+ * For all fully supported products.
+ * Only applicable to multi-GPU products.
+ * 
+ * @param device                                The identifier of the target device
+ * @param bridgeHierarchy                       Reference to the returned bridge chip Hierarchy
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if bridge chip exists
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a bridgeHierarchy is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if bridge chip not supported on the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy);
+
+/**
+ * Get information about processes with a compute context on a device
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * This function returns information only about running compute processes (e.g. CUDA applications which have an
+ * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
+ *
+ * To query the current number of running compute processes, call this function with *infoCount = 0. The
+ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+ * \a infos is allowed to be NULL.
+ *
+ * The usedGpuMemory field returned is all of the memory used by the application.
+ *
+ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
+ * time. Allocate more space for \a infos table in case new compute processes are spawned.
+ *
+ * @param device                               The identifier of the target device
+ * @param infoCount                            Reference in which to provide the \a infos array size, and
+ *                                             to return the number of returned elements
+ * @param infos                                Reference in which to return the process information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
+ *                                             \a infoCount will contain minimal amount of space necessary for
+ *                                             the call to complete
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see \ref nvmlSystemGetProcessName
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
+
+/**
+ * Get information about processes with a graphics context on a device
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * This function returns information only about graphics based processes 
+ * (e.g. applications using OpenGL, DirectX)
+ *
+ * To query the current number of running graphics processes, call this function with *infoCount = 0. The
+ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+ * \a infos is allowed to be NULL.
+ *
+ * The usedGpuMemory field returned is all of the memory used by the application.
+ *
+ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
+ * time. Allocate more space for \a infos table in case new graphics processes are spawned.
+ *
+ * @param device                               The identifier of the target device
+ * @param infoCount                            Reference in which to provide the \a infos array size, and
+ *                                             to return the number of returned elements
+ * @param infos                                Reference in which to return the process information
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
+ *                                             \a infoCount will contain minimal amount of space necessary for
+ *                                             the call to complete
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see \ref nvmlSystemGetProcessName
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
+
+/**
+ * Check if the GPU devices are on the same physical board.
+ *
+ * For all fully supported products.
+ *
+ * @param device1                               The first GPU device
+ * @param device2                               The second GPU device
+ * @param onSameBoard                           Reference in which to return the status.
+ *                                              Non-zero indicates that the GPUs are on the same board.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a onSameBoard has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device1 or \a device2 is invalid or \a onSameBoard is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this check is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if either GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard);
+
+/**
+ * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs.
+ * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions.
+ *
+ * For all fully supported products.
+ *
+ * @param device                               The identifier of the target device
+ * @param apiType                              Target API type for this operation
+ * @param isRestricted                         Reference in which to return the current restriction 
+ *                                             NVML_FEATURE_ENABLED indicates that the API is root-only
+ *                                             NVML_FEATURE_DISABLED indicates that the API is accessible to all users
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device or the device does not support
+ *                                                 the feature that is being queried (E.G. Enabling/disabling auto boosted clocks is
+ *                                                 not supported by the device)
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlRestrictedAPI_t
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted);
+
+/**
+ * Gets recent samples for the GPU.
+ * 
+ * For Kepler &tm; or newer fully supported devices.
+ * 
+ * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by 
+ * the driver.
+ * 
+ * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t.
+ * 
+ * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. 
+ * The returned sampleCount will provide the number of samples that can be queried. The user needs to
+ * allocate the buffer with size as sampleCount * sizeof(nvmlSample_t).
+ * 
+ * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the 
+ * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query
+ * to get more recent samples.
+ * 
+ * This method fetches the number of entries which can be accommodated in the provided samples array, and the 
+ * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this
+ * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost.
+ * 
+ * @param device                        The identifier for the target device
+ * @param type                          Type of sampling event
+ * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp. 
+ * @param sampleValType                 Output parameter to represent the type of sample value as described in nvmlSampleVal_t
+ * @param sampleCount                   Reference to provide the number of elements which can be queried in samples array
+ * @param samples                       Reference in which samples are returned
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if samples are successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a sampleCount is NULL or
+ *                                             the value referenced by \a sampleCount is 0 for non-NULL \a samples
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp,
+        nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
+
+/**
+ * Gets Total, Available and Used size of BAR1 memory.
+ * 
+ * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party 
+ * devices (peer-to-peer on the PCIE bus). 
+ * 
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param bar1Memory                           Reference in which BAR1 memory
+ *                                             information is returned.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if BAR1 memory is successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a bar1Memory is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory);
+
+
+/**
+ * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power 
+ * or thermal constraints.
+ *
+ * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The
+ * difference in violation times at two different reference times gives the indication of GPU throttling event. 
+ *
+ * Violation for thermal capping is not supported at this time.
+ * 
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param perfPolicyType                       Represents Performance policy which can trigger GPU throttling
+ * @param violTime                             Reference to which violation time related information is returned 
+ *                                         
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if violation time is successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime);
+
+/**
+ * @}
+ */
+
+/** @addtogroup nvmlAccountingStats
+ *  @{
+ */
+
+/**
+ * Queries the state of per process accounting mode.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlDeviceGetAccountingStats for more details.
+ * See \ref nvmlDeviceSetAccountingMode
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Reference in which to return the current accounting mode
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the mode has been successfully retrieved 
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Queries process's accounting stats.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * 
+ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
+ * Accounting stats can be queried during life time of the process and after its termination.
+ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and 
+ * updated to actual running time after its termination.
+ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
+ * processes.
+ *
+ * See \ref nvmlAccountingStats_t for description of each returned metric.
+ * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids.
+ *
+ * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode.
+ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
+ *         queried since they don't contribute to GPU utilization.
+ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
+ *
+ * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
+ * 
+ * @param device                               The identifier of the target device
+ * @param pid                                  Process Id of the target process to query stats for
+ * @param stats                                Reference in which to return the process's accounting stats
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if stats have been successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a stats is NULL
+ *         - \ref NVML_ERROR_NOT_FOUND         if process stats were not found
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingBufferSize
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
+
+/**
+ * Queries list of processes that can be queried for accounting stats. The list of processes returned 
+ * can be in running or terminated state.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * To just query the number of processes ready to be queried, call this function with *count = 0 and
+ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
+ * 
+ * For more details see \ref nvmlDeviceGetAccountingStats.
+ *
+ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
+ *
+ * @param device                               The identifier of the target device
+ * @param count                                Reference in which to provide the \a pids array size, and
+ *                                               to return the number of elements ready to be queried
+ * @param pids                                 Reference in which to return list of process ids
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if pids were successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to
+ *                                                 expected value)
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingBufferSize
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
+
+/**
+ * Returns the number of processes that the circular buffer with accounting pids can hold.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * This is the maximum number of processes that accounting information will be stored for before information
+ * about oldest processes will get overwritten by information about new processes.
+ *
+ * @param device                               The identifier of the target device
+ * @param bufferSize                           Reference in which to provide the size (in number of elements)
+ *                                               of the circular buffer for accounting stats.
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if buffer size was successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a bufferSize is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlDeviceGetAccountingStats
+ * @see nvmlDeviceGetAccountingPids
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize);
+
+/** @} */
+
+/** @addtogroup nvmlDeviceQueries
+ *  @{
+ */
+
+/**
+ * Returns the list of retired pages by source, including pages that are pending retirement.
+ * The address information provided from this API is the hardware address of the page that was retired.  Note
+ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63.
+ * 
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                            The identifier of the target device
+ * @param cause                             Filter page addresses by cause of retirement
+ * @param pageCount                         Reference in which to provide the \a addresses buffer size, and
+ *                                          to return the number of retired pages that match \a cause
+ *                                          Set to 0 to query the size without allocating an \a addresses buffer
+ * @param addresses                         Buffer to write the page addresses into
+ * 
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a pageCount was populated and \a addresses was filled
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
+ *                                             matching page addresses.  \a pageCount is set to the needed size.
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or 
+ *                                             \a addresses is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
+    unsigned int *pageCount, unsigned long long *addresses);
+
+/**
+ * Check if any pages are pending retirement and need a reboot to fully retire.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                            The identifier of the target device
+ * @param isPending                         Reference in which to return the pending status
+ * 
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isPending was populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isPending is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlUnitCommands Unit Commands
+ *  This chapter describes NVML operations that change the state of the unit. For S-class products.
+ *  Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
+ *  error code when invoking any of these methods.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Set the LED state for the unit. The LED can be either green (0) or amber (1).
+ *
+ * For S-class products.
+ * Requires root/admin permissions.
+ *
+ * This operation takes effect immediately.
+ * 
+ *
+ * <b>Current S-Class products don't provide unique LEDs for each unit. As such, both front 
+ * and back LEDs will be toggled in unison regardless of which unit is specified with this command.</b>
+ *
+ * See \ref nvmlLedColor_t for available colors.
+ *
+ * @param unit                                 The identifier of the target unit
+ * @param color                                The target LED color
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the LED color has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit or \a color is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlUnitGetLedState()
+ */
+nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlDeviceCommands Device Commands
+ *  This chapter describes NVML operations that change the state of the device.
+ *  Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
+ *  error code when invoking any of these methods.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Set the persistence mode for the device.
+ *
+ * For all products.
+ * For Linux only.
+ * Requires root/admin permissions.
+ *
+ * The persistence mode determines whether the GPU driver software is torn down after the last client
+ * exits.
+ *
+ * This operation takes effect immediately. It is not persistent across reboots. After each reboot the
+ * persistence mode is reset to "Disabled".
+ *
+ * See \ref nvmlEnableState_t for available modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 The target persistence mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the persistence mode was set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetPersistenceMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode);
+
+/**
+ * Set the compute mode for the device.
+ *
+ * For all products.
+ * Requires root/admin permissions.
+ *
+ * The compute mode determines whether a GPU can be used for compute operations and whether it can
+ * be shared across contexts.
+ *
+ * This operation takes effect immediately. Under Linux it is not persistent across reboots and
+ * always resets to "Default". Under Windows it is persistent.
+ *
+ * Under Windows compute mode may only be set to DEFAULT when running in WDDM
+ *
+ * See \ref nvmlComputeMode_t for details on available compute modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 The target compute mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the compute mode was set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetComputeMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode);
+
+/**
+ * Set the ECC mode for the device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ * Requires root/admin permissions.
+ *
+ * The ECC mode determines whether the GPU enables its ECC support.
+ *
+ * This operation takes effect after the next reboot.
+ *
+ * See \ref nvmlEnableState_t for details on available modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param ecc                                  The target ECC mode
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the ECC mode was set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a ecc is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetEccMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc);  
+
+/**
+ * Clear the ECC error and other memory error counts for the device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts.
+ * Requires root/admin permissions.
+ * Requires ECC Mode to be enabled.
+ *
+ * Sets all of the specified ECC counters to 0, including both detailed and total counts.
+ *
+ * This operation takes effect immediately.
+ *
+ * See \ref nvmlMemoryErrorType_t for details on available counter types.
+ *
+ * @param device                               The identifier of the target device
+ * @param counterType                          Flag that indicates which type of errors should be cleared.
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the error counts were cleared
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a counterType is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see 
+ *      - nvmlDeviceGetDetailedEccErrors()
+ *      - nvmlDeviceGetTotalEccErrors()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType);
+
+/**
+ * Set the driver model for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * For windows only.
+ * Requires root/admin permissions.
+ *
+ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
+ * to the device it must run in WDDM mode.  
+ *
+ * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce).
+ * This should only be done if the host is subsequently powered down and the display is detached from the device
+ * before the next reboot. 
+ *
+ * This operation takes effect after the next reboot.
+ * 
+ * Windows driver model may only be set to WDDM when running in DEFAULT compute mode.
+ *
+ * Changing the driver model to WDDM is not supported when the GPU doesn't support graphics acceleration or
+ * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode.
+ *
+ * See \ref nvmlDriverModel_t for details on available driver models.
+ * See \ref nvmlFlagDefault and \ref nvmlFlagForce
+ *
+ * @param device                               The identifier of the target device
+ * @param driverModel                          The target driver model
+ * @param flags                                Flags that change the default behavior
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the driver model has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a driverModel is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not windows or the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlDeviceGetDriverModel()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
+
+/**
+ * Set clocks that applications will lock to.
+ *
+ * Sets the clocks that compute and graphics applications will be running at.
+ * e.g. CUDA driver requests these clocks during context creation which means this property 
+ * defines clocks at which CUDA applications will be running unless some overspec event
+ * occurs (e.g. over power, over thermal or external HW brake).
+ *
+ * Can be used as a setting to request constant performance.
+ *
+ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * Requires root/admin permissions. 
+ *
+ * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks 
+ * for details on how to list available clocks combinations.
+ *
+ * After system reboot or driver reload applications clocks go back to their default value.
+ * See \ref nvmlDeviceResetApplicationsClocks.
+ *
+ * @param device                               The identifier of the target device
+ * @param memClockMHz                          Requested memory clock in MHz
+ * @param graphicsClockMHz                     Requested graphics clock in MHz
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if new settings were successfully set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memClockMHz and \a graphicsClockMHz 
+ *                                                 is not a valid clock combination
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation 
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
+
+/**
+ * Set new power limit of this device.
+ * 
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
+ *
+ * \note Limit is not persistent across reboots or driver unloads.
+ * Enable persistence mode to prevent the driver from unloading when no application is using the device.
+ *
+ * @param device                               The identifier of the target device
+ * @param limit                                Power management limit in milliwatts to set
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a limit has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is out of range
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetPowerManagementLimitConstraints
+ * @see nvmlDeviceGetPowerManagementDefaultLimit
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit);
+
+/**
+ * Sets new GOM. See \a nvmlGpuOperationMode_t for details.
+ *
+ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
+ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
+ * Not supported on Quadro &reg; and Tesla &tm; C-class products.
+ * Requires root/admin permissions.
+ * 
+ * Changing GOMs requires a reboot. 
+ * The reboot requirement might be removed in the future.
+ *
+ * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when
+ * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
+ * 
+ * @param device                               The identifier of the target device
+ * @param mode                                 Target GOM
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is incorrect
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support GOM or specific mode
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlGpuOperationMode_t
+ * @see nvmlDeviceGetGpuOperationMode
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode);
+
+/**
+ * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs.
+ * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs.
+ * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction
+ * to query the current restriction settings.
+ * 
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * @param device                               The identifier of the target device
+ * @param apiType                              Target API type for this operation
+ * @param isRestricted                         The target restriction
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a apiType is incorrect
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support changing API restrictions or the device does not support
+ *                                                 the feature that API restrictions are being set for (e.g. enabling/disabling auto
+ *                                                 boosted clocks is not supported by the device)
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlRestrictedAPI_t
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted);
+
+/**
+ * @}
+ */
+ 
+/** @addtogroup nvmlAccountingStats
+ *  @{
+ */
+
+/**
+ * Enables or disables per process accounting.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * @note This setting is not persistent and will default to disabled after driver unloads.
+ *       Enable persistence mode to be sure the setting doesn't switch off to disabled.
+ * 
+ * @note Enabling accounting mode has no negative impact on the GPU performance.
+ *
+ * @note Disabling accounting clears all accounting pids information.
+ *
+ * See \ref nvmlDeviceGetAccountingMode
+ * See \ref nvmlDeviceGetAccountingStats
+ * See \ref nvmlDeviceClearAccountingPids
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 The target accounting mode
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the new mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a mode are invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode);
+
+/**
+ * Clears accounting information about all processes that have already terminated.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetAccountingMode
+ * See \ref nvmlDeviceGetAccountingStats
+ * See \ref nvmlDeviceSetAccountingMode
+ *
+ * @param device                               The identifier of the target device
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if accounting information has been cleared 
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup NvLink NvLink Methods
+ * This chapter describes methods that NVML can perform on NVLINK enabled devices.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieves the state of the device's NvLink for the link specified
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param isActive                             \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that
+ *                                             the link is active and NVML_FEATURE_DISABLED indicates it 
+ *                                             is inactive
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a isActive has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a isActive is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+
+/**
+ * Retrieves the version of the device's NvLink for the link specified
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param version                              Requested NvLink version
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a version is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version);
+
+/**
+ * Retrieves the requested capability from the device's NvLink for the link specified
+ * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried
+ * The return value should be treated as a boolean.
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param capability                           Specifies the \a nvmlNvLinkCapability_t to be queried
+ * @param capResult                            A boolean for the queried capability indicating that feature is available
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a capResult has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a capability is invalid or \a capResult is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult); 
+
+/**
+ * Retrieves the PCI information for the remote node on a NvLink link 
+ * Note: pciSubSystemId is not filled in this function and is indeterminate
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param pci                                  \a nvmlPciInfo_t of the remote node for the specified link                            
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a pci has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a pci is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+
+/**
+ * Retrieves the specified error counter value
+ * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param counter                              Specifies the NvLink counter to be queried
+ * @param counterValue                         Returned counter value
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a counterValue has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid or \a counterValue is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link,
+                                                     nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue);
+
+/**
+ * Resets all error counters to zero
+ * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link whose error counters are reset
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the reset is successful
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link);
+
+/**
+ * Set the NVLINK utilization counter control information for the specified counter, 0 or 1.
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition.  Performs a reset
+ * of the counters if the reset parameter is non-zero.
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param counter                              Specifies the counter that should be set (0 or 1).
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to set
+ * @param reset                                Resets the counters on set if non-zero
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the control has been set successfully
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid 
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
+                                                           nvmlNvLinkUtilizationControl_t *control, unsigned int reset);
+
+/**
+ * Get the NVLINK utilization counter control information for the specified counter, 0 or 1.
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param counter                              Specifies the counter that should be queried (0 or 1).
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to place information
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the control information has been retrieved successfully
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid 
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
+                                                           nvmlNvLinkUtilizationControl_t *control);
+
+
+/**
+ * Retrieve the NVLINK utilization counter based on the current control for a specified counter.
+ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl
+ *  before reading the utilization counters as they have no default state
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param counter                              Specifies the counter that should be read (0 or 1).
+ * @param rxcounter                            Receive counter return value
+ * @param txcounter                            Transmit counter return value
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a rxcounter and \a txcounter have been successfully set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, 
+                                                           unsigned long long *rxcounter, unsigned long long *txcounter);
+
+/**
+ * Freeze the NVLINK utilization counters 
+ * Both the receive and transmit counters are operated on by this function
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param counter                              Specifies the counter that should be frozen (0 or 1).
+ * @param freeze                               NVML_FEATURE_ENABLED = freeze the receive and transmit counters
+ *                                             NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if counters were successfully frozen or unfrozen
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, \a counter, or \a freeze is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, 
+                                            unsigned int counter, nvmlEnableState_t freeze);
+
+/**
+ * Reset the NVLINK utilization counters 
+ * Both the receive and transmit counters are operated on by this function
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be reset
+ * @param counter                              Specifies the counter that should be reset (0 or 1)
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if counters were successfully reset
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlEvents Event Handling Methods
+ * This chapter describes methods that NVML can perform against each device to register and wait for 
+ * some event to occur.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Create an empty set of events.
+ * Event set should be freed by \ref nvmlEventSetFree
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * @param set                                  Reference in which to return the event handle
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the event set has been created
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a set is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlEventSetFree
+ */
+nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set);
+
+/**
+ * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors)
+ * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode)
+ *
+ * For Linux only.
+ *
+ * \b IMPORTANT: Operations on \a set are not thread safe
+ *
+ * This call starts recording of events on specific device.
+ * All events that occurred before this call are not recorded.
+ * Checking if some event occurred can be done with \ref nvmlEventSetWait
+ *
+ * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed.
+ * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes
+ *     are registered in that case.
+ *
+ * @param device                               The identifier of the target device
+ * @param eventTypes                           Bitmask of \ref nvmlEventType to record
+ * @param set                                  Set to which the new event types are added
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the requested event types have been registered
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is invalid or \a set is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform does not support this feature or some of requested event types
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlEventType
+ * @see nvmlDeviceGetSupportedEventTypes
+ * @see nvmlEventSetWait
+ * @see nvmlEventSetFree
+ */
+nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set);
+
+/**
+ * Returns information about events supported on device
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows.
+ *
+ * @param device                               The identifier of the target device
+ * @param eventTypes                           Reference in which to return bitmask of supported events
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the eventTypes has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlEventType
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes);
+
+/**
+ * Waits on events and delivers events
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * If some events are ready to be delivered at the time of the call, function returns immediately.
+ * If there are no events ready to be delivered, function sleeps till event arrives 
+ * but not longer than specified timeout. This function in certain conditions can return before
+ * specified timeout passes (e.g. when interrupt arrives)
+ * 
+ * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple
+ * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all
+ * xid error events.
+ * 
+ * @param set                                  Reference to set of events to wait on
+ * @param data                                 Reference in which to return event data
+ * @param timeoutms                            Maximum amount of wait time in milliseconds for registered event
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the data has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a data is NULL
+ *         - \ref NVML_ERROR_TIMEOUT           if no event arrived in specified timeout or interrupt arrived
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if a GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlEventType
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
+
+/**
+ * Releases events in the set
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param set                                  Reference to events to be released 
+ * 
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the event has been successfully released
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ * 
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlZPI Drain states 
+ * This chapter describes methods that NVML can perform against each device to control their drain state
+ * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to
+ * power on/off GPUs, enable robust reset scenarios, etc.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Modify the drain state of a GPU.  This method forces a GPU to no longer accept new incoming requests.
+ * Any new NVML process will see a gap in the enumeration where this GPU should exist as any call to that
+ * GPU outside of the drain state APIs will fail.
+ * Must be called as administrator.
+ * For Linux only.
+ * 
+ * For newer than Maxwell &tm; fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param nvmlIndex                            The ID of the target device
+ * @param newState                             The drain state that should be entered, see \ref nvmlEnableState_t
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the drain state was successfully modified
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a nvmlIndex or \a newState is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (unsigned int nvmlIndex, nvmlEnableState_t newState);
+
+/**
+ * Query the drain state of a GPU.  This method is used to check whether a GPU is currently in a
+ * draining state.
+ * For Linux only.
+ * 
+ * For newer than Maxwell &tm; fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param nvmlIndex                            The ID of the target device
+ * @param currentState                         The current drain state for this GPU, see \ref nvmlEnableState_t
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if \a currentState has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a nvmlIndex or \a currentState is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (unsigned int nvmlIndex, nvmlEnableState_t *currentState);
+
+/**
+ * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver
+ * as long as no other processes are attached. If other processes are attached, this call will return
+ * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the
+ * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called
+ * to initiate the draining state is if that process was using, and is still using, a GPU before the 
+ * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled
+ * prior to this call.
+ *
+ * For long-running NVML processes please note that this will change the enumeration of current GPUs.
+ * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2.
+ * Also, device handles after the removed GPU will not be valid and must be re-established.
+ * Must be run as administrator. 
+ * For Linux only.
+ *
+ * For newer than Maxwell &tm; fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param nvmlIndex                            The ID of the target device
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the GPU was successfully removed
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a nvmlIndex is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_IN_USE            if the device is still in use and cannot be removed
+ */
+nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (unsigned int nvmlIndex);
+
+/**
+ * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
+ * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device.  
+ * If all are zeroes then the entire PCI tree will be searched.  Please note that for long-running NVML processes
+ * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order.
+ *
+ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds
+ * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery.
+ *
+ * Must be run as administrator.
+ * For Linux only.
+ * 
+ * For newer than Maxwell &tm; fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo                              The PCI tree to be searched.  Only the domain, bus, and device
+ *                                             fields are used in this call.
+ *
+ * @return 
+ *         - \ref NVML_SUCCESS                 if the requested portion of the PCI tree was successfully rescanned
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the operating system does not support this feature
+ *         - \ref NVML_ERROR_OPERATING_SYSTEM  if the operating system is denying this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo);
+
+/** @} */
+
+/**
+ * NVML API versioning support
+ */
+#if defined(__NVML_API_VERSION_INTERNAL)
+#undef nvmlDeviceGetPciInfo
+#undef nvmlDeviceGetCount
+#undef nvmlDeviceGetHandleByIndex
+#undef nvmlDeviceGetHandleByPciBusId
+#undef nvmlInit
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

From 67d8c89b6a0adec6641c6f8152eba49717547b76 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Wed, 14 Sep 2016 14:17:48 -0400
Subject: [PATCH 058/150] Fix the size of data copied from device.

This seems to have a slight performance bonus.

On GeForce GTX 970 goes from 1.528GH/s to 1.576GH/s.

Closes #98.
---
 cudevice.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cudevice.go b/cudevice.go
index 78963dd..7be306e 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -253,7 +253,7 @@ func (d *Device) runCuDevice() error {
 
 		decredHashNonce(gridx, blockx, throughput, startNonce, nonceResultsD, targetHigh)
 
-		cu.MemcpyDtoH(nonceResultsH, nonceResultsD, d.cuInSize*4)
+		cu.MemcpyDtoH(nonceResultsH, nonceResultsD, d.cuInSize)
 
 		numResults := nonceResultsHSlice[0]
 		for i, result := range nonceResultsHSlice[1 : 1+numResults] {

From ffbad25076c897d5fc3408115651c5ee59d7160c Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Fri, 16 Sep 2016 09:39:23 -0500
Subject: [PATCH 059/150] implement amdgpu sysfs support to fetch fan and
 temperature information (#100)

---
 cldevice.go | 129 +++++++++++++++++++++++++++++++++++++++++++++++++---
 device.go   |   6 +++
 2 files changed, 128 insertions(+), 7 deletions(-)

diff --git a/cldevice.go b/cldevice.go
index 7183c14..4592c2d 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -3,9 +3,15 @@
 package main
 
 import (
+	"bufio"
 	"fmt"
+	"io/ioutil"
 	"math"
 	"os"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync/atomic"
 	"time"
 	"unsafe"
 
@@ -14,6 +20,97 @@ import (
 	"github.com/decred/gominer/work"
 )
 
+// If the device order and OpenCL index are ever not the same then we can
+// implement topology finding code:
+// https://github.com/Oblomov/clinfo/blob/master/src/clinfo.c#L1061-L1126
+func determineDeviceKind(index int, deviceName string) string {
+	deviceKind := "unknown"
+
+	switch runtime.GOOS {
+	case "linux":
+		// check if the amdgpu driver is loaded
+		if _, err := os.Stat("/sys/module/amdgpu"); err == nil {
+			// make sure a sysfs entry exists for the index of this device
+			if _, err := os.Stat("/sys/class/drm/card" + strconv.Itoa(index)); err == nil {
+				deviceKind = "amdgpu"
+			}
+		}
+		break
+	}
+
+	return deviceKind
+}
+
+func deviceInfoAMDGPU(index int) (uint32, uint32) {
+	basePath := "/sys/class/drm/card_I_/device/hwmon/"
+	basePath = strings.Replace(basePath, "_I_", strconv.Itoa(index), 1)
+	fanPercent := uint32(0)
+	hwmonPath := basePath + "_HWMON_/"
+	hwmonName := ""
+	temperature := uint32(0)
+
+	files, err := ioutil.ReadDir(basePath)
+	if err != nil {
+		minrLog.Errorf("unable to read AMDGPU sysfs dir: %v", err)
+		return fanPercent, temperature
+	}
+
+	for _, f := range files {
+		// we should only find one entry but the API may not be stable
+		if strings.Contains(f.Name(), "hwmon") {
+			hwmonName = f.Name()
+		}
+	}
+
+	if hwmonName == "" {
+		minrLog.Errorf("unable to determine AMDGPU hwmon path")
+		return fanPercent, temperature
+	}
+
+	hwmonPath = strings.Replace(hwmonPath, "_HWMON_", hwmonName, 1)
+	pwmMax := uint32(255) // could read this from pwm1_max but it seems to be a constant
+	tempDivisor := uint32(1000)
+
+	fanPercent = deviceInfoReadSysfsEntry(hwmonPath + "pwm1")
+	fanPercentFloat := float64(fanPercent) / float64(pwmMax) * float64(100)
+	fanPercent = uint32(fanPercentFloat)
+	temperature = deviceInfoReadSysfsEntry(hwmonPath+"temp1_input") / tempDivisor
+
+	return fanPercent, temperature
+}
+
+func deviceInfoReadSysfsEntry(path string) uint32 {
+	res := uint32(0)
+	dataRaw := ""
+
+	f, err := os.Open(path)
+	if err != nil {
+		if err != nil {
+			minrLog.Errorf("unable to open %v", path)
+			return res
+		}
+	}
+	defer f.Close()
+
+	r := bufio.NewScanner(f)
+	for r.Scan() {
+		dataRaw = string(r.Bytes())
+	}
+	if err := r.Err(); err != nil {
+		return res
+	}
+
+	dataInt, err := strconv.Atoi(dataRaw)
+	if err != nil {
+		minrLog.Errorf("unable to convert to int %v", err)
+		return res
+	}
+
+	res = uint32(dataInt)
+
+	return res
+}
+
 func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
 	var platformID cl.CL_platform_id
 	platformIDs, err := getCLPlatforms()
@@ -86,13 +183,15 @@ func ListDevices() {
 func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
 	workDone chan []byte) (*Device, error) {
 	d := &Device{
-		index:      index,
-		platformID: platformID,
-		deviceID:   deviceID,
-		deviceName: getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
-		quit:       make(chan struct{}),
-		newWork:    make(chan *work.Work, 5),
-		workDone:   workDone,
+		index:       index,
+		platformID:  platformID,
+		deviceID:    deviceID,
+		deviceName:  getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
+		quit:        make(chan struct{}),
+		newWork:     make(chan *work.Work, 5),
+		workDone:    workDone,
+		fanPercent:  0,
+		temperature: 0,
 	}
 
 	var status cl.CL_int
@@ -226,6 +325,22 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		d.index, globalWorkSize, intensity)
 	d.workSize = globalWorkSize
 
+	// Determine the device/driver kind
+	d.kind = determineDeviceKind(d.index, d.deviceName)
+
+	switch d.kind {
+	case "amdgpu":
+		fanPercent, temperature := deviceInfoAMDGPU(d.index)
+		// Newer cards will idle with the fan off so just check if we got
+		// a good temperature reading
+		if temperature != 0 {
+			atomic.StoreUint32(&d.fanPercent, fanPercent)
+			atomic.StoreUint32(&d.temperature, temperature)
+			d.fanTempActive = true
+		}
+		break
+	}
+
 	return d, nil
 }
 
diff --git a/device.go b/device.go
index dcd0acd..aa350c3 100644
--- a/device.go
+++ b/device.go
@@ -346,10 +346,16 @@ func (d *Device) UpdateFanTemp() {
 	defer d.Unlock()
 	if d.fanTempActive {
 		switch d.kind {
+		case "amdgpu":
+			fanPercent, temperature := deviceInfoAMDGPU(d.index)
+			atomic.StoreUint32(&d.fanPercent, fanPercent)
+			atomic.StoreUint32(&d.temperature, temperature)
+			break
 		case "nvidia":
 			fanPercent, temperature := deviceInfoNVIDIA(d.index)
 			atomic.StoreUint32(&d.fanPercent, fanPercent)
 			atomic.StoreUint32(&d.temperature, temperature)
+			break
 		}
 	}
 }

From 868e371241fbcf82a8d94a83d7982cdf9e9c0be5 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Fri, 16 Sep 2016 10:32:39 -0500
Subject: [PATCH 060/150] fix using a device on the second OpenCL platform
 (#102)

---
 miner.go | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/miner.go b/miner.go
index 2732d12..d801a2f 100644
--- a/miner.go
+++ b/miner.go
@@ -45,15 +45,15 @@ func NewMiner() (*Miner, error) {
 		m.pool = s
 	}
 
+	deviceListIndex := 0
+	deviceListEnabledCount := 0
+
 	if cfg.UseCuda {
 		CUdeviceIDs, err := getCUInfo()
 		if err != nil {
 			return nil, err
 		}
 
-		deviceListIndex := 0
-		deviceListEnabledCount := 0
-
 		// XXX Can probably combine these bits with the opencl ones once
 		// I decide what to do about the types.
 
@@ -81,20 +81,12 @@ func NewMiner() (*Miner, error) {
 			}
 			deviceListIndex++
 		}
-
-		if deviceListEnabledCount == 0 {
-			return nil, fmt.Errorf("No devices started")
-		}
-
 	} else {
 		platformIDs, err := getCLPlatforms()
 		if err != nil {
 			return nil, fmt.Errorf("Could not get CL platforms: %v", err)
 		}
 
-		deviceListIndex := 0
-		deviceListEnabledCount := 0
-
 		for p := range platformIDs {
 			platformID := platformIDs[p]
 			CLdeviceIDs, err := getCLDevices(platformID)
@@ -125,13 +117,13 @@ func NewMiner() (*Miner, error) {
 				}
 				deviceListIndex++
 			}
-
-			if deviceListEnabledCount == 0 {
-				return nil, fmt.Errorf("No devices started")
-			}
 		}
 	}
 
+	if deviceListEnabledCount == 0 {
+		return nil, fmt.Errorf("No devices started")
+	}
+
 	m.started = uint32(time.Now().Unix())
 
 	return m, nil

From d6ff92c3d517a8716a4b42c17882bafc1824cfa7 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Tue, 20 Sep 2016 11:02:58 -0500
Subject: [PATCH 061/150] use a slice of submitIDs instead of a single submitID
 (#103)

---
 getwork.go         |  2 --
 stratum/stratum.go | 41 +++++++++++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/getwork.go b/getwork.go
index c916ddf..438437c 100644
--- a/getwork.go
+++ b/getwork.go
@@ -316,7 +316,5 @@ func GetPoolWorkSubmit(data []byte, pool *stratum.Stratum) (bool, error) {
 		return false, err
 	}
 
-	pool.Submitted = true
-
 	return true, nil
 }
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 78db377..53e47d8 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -53,10 +53,9 @@ type Stratum struct {
 	ID        uint64
 	authID    uint64
 	subID     uint64
-	submitID  uint64
+	submitIDs []uint64
 	Diff      float64
 	Target    *big.Int
-	Submitted bool
 	PoolWork  NotifyWork
 }
 
@@ -154,6 +153,25 @@ type Submit struct {
 // errJsonType is an error for json that we do not expect.
 var errJsonType = errors.New("Unexpected type in json.")
 
+func sliceContains(s []uint64, e uint64) bool {
+	for _, a := range s {
+		if a == e {
+			return true
+		}
+	}
+	return false
+}
+
+func sliceRemove(s []uint64, e uint64) []uint64 {
+	for i, a := range s {
+		if a == e {
+			return append(s[:i], s[i+1:]...)
+		}
+	}
+
+	return s
+}
+
 // StratumConn starts the initial connection to a stratum pool and sets defaults
 // in the pool object.
 func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string) (*Stratum, error) {
@@ -309,7 +327,7 @@ func (s *Stratum) handleBasicReply(resp interface{}) {
 			log.Error("Auth failure.")
 		}
 	}
-	if aResp.ID == s.submitID {
+	if sliceContains(s.submitIDs, aResp.ID.(uint64)) {
 		if aResp.Result {
 			atomic.AddUint64(&s.ValidShares, 1)
 			log.Debug("Share accepted")
@@ -317,7 +335,7 @@ func (s *Stratum) handleBasicReply(resp interface{}) {
 			atomic.AddUint64(&s.InvalidShares, 1)
 			log.Error("Share rejected: ", aResp.Error.ErrStr)
 		}
-		s.Submitted = false
+		s.submitIDs = sliceRemove(s.submitIDs, aResp.ID.(uint64))
 	}
 }
 
@@ -425,7 +443,7 @@ func (s *Stratum) Auth() error {
 		return errJsonType
 	}
 	s.authID = id
-	s.ID += 1
+	s.ID++
 	log.Tracef("%v", msg)
 	m, err := json.Marshal(msg)
 	if err != nil {
@@ -467,7 +485,7 @@ func (s *Stratum) Subscribe() error {
 	return nil
 }
 
-// Unmarshal provides a json umnarshaler for the commands.
+// Unmarshal provides a json unmarshaler for the commands.
 // I'm sure a lot of this can be generalized but the json we deal with
 // is pretty yucky.
 func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
@@ -603,7 +621,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		resp.ExtraNonce2Length = resi[2].(float64)
 		return resp, nil
 	}
-	if id == s.submitID && s.Submitted {
+	if sliceContains(s.submitIDs, id) {
 		var (
 			objmap      map[string]json.RawMessage
 			id          uint64
@@ -953,16 +971,15 @@ func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 		return sub, err
 	}
 
-	s.ID++
-	sub.ID = s.ID
-	s.submitID = s.ID
-	s.Submitted = true
-
 	latestWorkTs := atomic.LoadUint32(&s.latestJobTime)
 	if uint32(submittedHeader.Timestamp.Unix()) != latestWorkTs {
 		return sub, ErrStratumStaleWork
 	}
 
+	s.ID++
+	sub.ID = s.ID
+	s.submitIDs = append(s.submitIDs, s.ID)
+
 	// The timestamp string should be:
 	//
 	//   timestampStr := fmt.Sprintf("%08x",

From ddaf07f52fc64916be143d23858774bc184e3e9c Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 16 Sep 2016 15:03:25 -0400
Subject: [PATCH 062/150] Use build flags to only build cuda or opencl versions
 of gominer.

-tags 'opencl' or -tags 'cuda'

Add type (cuda or opencl) to version string.

Fixes #83
---
 GNUmakefile  |   2 +-
 README.md    |  19 ++++--
 calibrate.go |   2 +
 cldevice.go  | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 config.go    |  13 +---
 cudevice.go  | 117 ++++++++++++++++++++++++++++++--
 device.go    | 166 ++-------------------------------------------
 main.go      |   2 +-
 miner.go     |  76 +--------------------
 9 files changed, 326 insertions(+), 257 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index 014403d..d161297 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -18,7 +18,7 @@ obj/cuda.a: obj/blake.o obj/decred.o
 	$(AR) rvs obj/cuda.a obj/blake.o obj/decred.o
 
 build: obj/cuda.a
-	go build
+	go build -tags 'cuda'
 
 install: obj/cuda.a
 	go install
diff --git a/README.md b/README.md
index 089d970..f831d60 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,9 @@
 
 ## Installation
 
-You need to have the OpenCL and CUDA development libraries
-installed. You only need the runtime and drives for the one you plan
+You need to have the OpenCL or CUDA development libraries
+installed (depending on which version of gominer you would like to
+build). You also need the runtime and drivers for the one you plan
 on running (CUDA for nvidia, OpenCL for anything) To download and
 build gominer, run:
 
@@ -14,7 +15,17 @@ cd $GOPATH/src/github.com/decred
 git clone  https://github.com/decred/gominer.git
 cd gominer
 glide i
-go install $(glide nv)
+```
+
+for opencl:
+```
+go install -tags 'opencl'
+```
+
+for cuda:
+```
+make
+go install -tags 'cuda'
 ```
 
 On Ubuntu 16.04 you can install the necessary OpenCL packages (for
@@ -25,7 +36,7 @@ sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
 Other graphics cards will need different libraries.  We have built
-successfully on Ubuntu 16.04 with go1.6.2, g++ 5.4.0 and
+successfully on Ubuntu 16.04 with go1.6.2, go1.7.1, g++ 5.4.0 and
 beignet-dev 1.1.1-2 although other combinations should work as well.
 
 ## Running
diff --git a/calibrate.go b/calibrate.go
index 184d319..e6ff001 100644
--- a/calibrate.go
+++ b/calibrate.go
@@ -1,5 +1,7 @@
 // Copyright (c) 2016 The Decred developers.
 
+// +build opencl,!cuda
+
 package main
 
 import (
diff --git a/cldevice.go b/cldevice.go
index 4592c2d..f457e46 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -1,16 +1,21 @@
 // Copyright (c) 2016 The Decred developers.
 
+// +build opencl,!cuda
+
 package main
 
 import (
 	"bufio"
+	"bytes"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"math"
 	"os"
 	"runtime"
 	"strconv"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"time"
 	"unsafe"
@@ -20,6 +25,107 @@ import (
 	"github.com/decred/gominer/work"
 )
 
+// Return the GPU library in use.
+func gpuLib() string {
+	return "OpenCL"
+}
+
+const (
+	outputBufferSize = cl.CL_size_t(64)
+	localWorksize    = 64
+	uint32Size       = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
+)
+
+var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
+
+func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
+	var programBuffer [1][]byte
+	var programSize [1]cl.CL_size_t
+
+	// Read each program file and place content into buffer array.
+	programHandle, err := os.Open(filename)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer programHandle.Close()
+
+	buf := bytes.NewBuffer(nil)
+	_, err = io.Copy(buf, programHandle)
+	if err != nil {
+		return nil, nil, err
+	}
+	str := string(buf.Bytes())
+	programFinal := []byte(str)
+
+	programSize[0] = cl.CL_size_t(len(programFinal))
+	programBuffer[0] = make([]byte, programSize[0])
+	for i := range programFinal {
+		programBuffer[0][i] = programFinal[i]
+	}
+
+	return programBuffer[:], programSize[:], nil
+}
+
+func clError(status cl.CL_int, f string) error {
+	if -status < 0 || int(-status) > len(cl.ERROR_CODES_STRINGS) {
+		return fmt.Errorf("returned unknown error")
+	}
+
+	return fmt.Errorf("%s returned error %s (%d)", f,
+		cl.ERROR_CODES_STRINGS[-status], status)
+}
+
+type Device struct {
+	// The following variables must only be used atomically.
+	fanPercent  uint32
+	temperature uint32
+
+	sync.Mutex
+	index int
+	cuda  bool
+
+	// Items for OpenCL device
+	platformID    cl.CL_platform_id
+	deviceID      cl.CL_device_id
+	deviceName    string
+	context       cl.CL_context
+	queue         cl.CL_command_queue
+	outputBuffer  cl.CL_mem
+	program       cl.CL_program
+	kernel        cl.CL_kernel
+	fanTempActive bool
+	kind          string
+
+	//cuInput        cu.DevicePtr
+	cuInSize       int64
+	cuOutputBuffer []float64
+
+	workSize uint32
+
+	// extraNonce is the device extraNonce, where the first
+	// byte is the device ID (supporting up to 255 devices)
+	// while the last 3 bytes is the extraNonce value. If
+	// the extraNonce goes through all 0x??FFFFFF values,
+	// it will reset to 0x??000000.
+	extraNonce    uint32
+	currentWorkID uint32
+
+	midstate  [8]uint32
+	lastBlock [16]uint32
+
+	work     work.Work
+	newWork  chan *work.Work
+	workDone chan []byte
+	hasWork  bool
+
+	started          uint32
+	allDiffOneShares uint64
+	validShares      uint64
+	invalidShares    uint64
+
+	quit chan struct{}
+}
+
 // If the device order and OpenCL index are ever not the same then we can
 // implement topology finding code:
 // https://github.com/Oblomov/clinfo/blob/master/src/clinfo.c#L1061-L1126
@@ -41,7 +147,7 @@ func determineDeviceKind(index int, deviceName string) string {
 	return deviceKind
 }
 
-func deviceInfoAMDGPU(index int) (uint32, uint32) {
+func deviceInfo(index int) (uint32, uint32) {
 	basePath := "/sys/class/drm/card_I_/device/hwmon/"
 	basePath = strings.Replace(basePath, "_I_", strconv.Itoa(index), 1)
 	fanPercent := uint32(0)
@@ -330,7 +436,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 
 	switch d.kind {
 	case "amdgpu":
-		fanPercent, temperature := deviceInfoAMDGPU(d.index)
+		fanPercent, temperature := deviceInfo(d.index)
 		// Newer cards will idle with the fan off so just check if we got
 		// a good temperature reading
 		if temperature != 0 {
@@ -459,3 +565,79 @@ func (d *Device) runDevice() error {
 			elapsedTime)
 	}
 }
+
+func newMinerDevs(m *Miner) (*Miner, int, error) {
+	deviceListIndex := 0
+	deviceListEnabledCount := 0
+
+	platformIDs, err := getCLPlatforms()
+	if err != nil {
+		return nil, 0, fmt.Errorf("Could not get CL platforms: %v", err)
+	}
+
+	for p := range platformIDs {
+		platformID := platformIDs[p]
+		CLdeviceIDs, err := getCLDevices(platformID)
+		if err != nil {
+			return nil, 0, fmt.Errorf("Could not get CL devices for platform: %v", err)
+		}
+
+		for _, CLdeviceID := range CLdeviceIDs {
+			miningAllowed := false
+
+			// Enforce device restrictions if they exist
+			if len(cfg.DeviceIDs) > 0 {
+				for _, i := range cfg.DeviceIDs {
+					if deviceListIndex == i {
+						miningAllowed = true
+					}
+				}
+			} else {
+				miningAllowed = true
+			}
+			if miningAllowed {
+				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
+				deviceListEnabledCount++
+				m.devices = append(m.devices, newDevice)
+				if err != nil {
+					return nil, 0, err
+				}
+			}
+			deviceListIndex++
+		}
+	}
+	return m, deviceListEnabledCount, nil
+
+}
+
+func getDeviceInfo(id cl.CL_device_id,
+	name cl.CL_device_info,
+	str string) string {
+
+	var errNum cl.CL_int
+	var paramValueSize cl.CL_size_t
+
+	errNum = cl.CLGetDeviceInfo(id, name, 0, nil, &paramValueSize)
+
+	if errNum != cl.CL_SUCCESS {
+		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
+	}
+
+	var info interface{}
+	errNum = cl.CLGetDeviceInfo(id, name, paramValueSize, &info, nil)
+	if errNum != cl.CL_SUCCESS {
+		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
+	}
+
+	strinfo := fmt.Sprintf("%v", info)
+
+	return strinfo
+}
+
+func (d *Device) Release() {
+	cl.CLReleaseKernel(d.kernel)
+	cl.CLReleaseProgram(d.program)
+	cl.CLReleaseCommandQueue(d.queue)
+	cl.CLReleaseMemObject(d.outputBuffer)
+	cl.CLReleaseContext(d.context)
+}
diff --git a/config.go b/config.go
index e417a70..3cd8fe5 100644
--- a/config.go
+++ b/config.go
@@ -39,9 +39,8 @@ var (
 )
 
 type config struct {
-	ListDevices   bool `short:"l" long:"listdevices" description:"List number of devices."`
-	ListCuDevices bool `long:"listcudadevices" description:"List number of CUDA devices."`
-	ShowVersion   bool `short:"V" long:"version" description:"Display version information and exit"`
+	ListDevices bool `short:"l" long:"listdevices" description:"List number of devices."`
+	ShowVersion bool `short:"V" long:"version" description:"Display version information and exit"`
 
 	// Config / log options
 	ConfigFile string `short:"C" long:"configfile" description:"Path to configuration file"`
@@ -70,7 +69,6 @@ type config struct {
 	SimNet        bool `long:"simnet" description:"Connect to the simulation test network"`
 	TLSSkipVerify bool `long:"skipverify" description:"Do not verify tls certificates (not recommended!)"`
 
-	UseCuda           bool   `short:"U" long:"cuda" description:"Use CUDA if GPU supports it"`
 	Autocalibrate     string `short:"A" long:"autocalibrate" description:"Time target in milliseconds to spend executing hashes on the device during each iteration. Single global value or a comma separated list."`
 	AutocalibrateInts []int
 	Devices           string `short:"D" long:"devices" description:"Single device ID or a comma separated list of device IDs to use."`
@@ -266,13 +264,8 @@ func loadConfig() (*config, []string, error) {
 		os.Exit(0)
 	}
 
-	if preCfg.ListCuDevices {
-		ListCuDevices()
-		os.Exit(0)
-	}
-
 	if preCfg.ShowVersion {
-		fmt.Println(appName, "version", version())
+		fmt.Println(appName, gpuLib(), "version", version())
 		os.Exit(0)
 	}
 
diff --git a/cudevice.go b/cudevice.go
index 7be306e..f9399a7 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -1,5 +1,7 @@
 // Copyright (c) 2016 The Decred developers.
 
+// +build cuda,!opencl
+
 package main
 
 /*
@@ -14,6 +16,7 @@ import (
 	"fmt"
 	"reflect"
 	"runtime"
+	"sync"
 	"sync/atomic"
 	"time"
 	"unsafe"
@@ -31,6 +34,62 @@ const (
 	blockx          = threadsPerBlock
 )
 
+// gpuLib returns the name of the GPU library this build uses.
+func gpuLib() string {
+	return "Cuda"
+}
+
+const (
+	localWorksize      = 64
+	cuOutputBufferSize = 64
+)
+
+type Device struct {
+	// fanPercent and temperature must only be accessed atomically.
+	fanPercent  uint32
+	temperature uint32
+
+	sync.Mutex
+	index int
+	cuda  bool
+
+	deviceName    string
+	fanTempActive bool   // UpdateFanTemp only refreshes readings when set
+	kind          string // selects the fan/temperature path in UpdateFanTemp
+
+	// Items for CUDA device
+	cuDeviceID cu.Device
+	cuContext  cu.Context
+	//cuInput        cu.DevicePtr
+	cuInSize       int64
+	cuOutputBuffer []float64
+
+	workSize uint32
+
+	// extraNonce is the device extraNonce, where the first
+	// byte is the device ID (supporting up to 255 devices)
+	// while the last 3 bytes is the extraNonce value. If
+	// the extraNonce goes through all 0x??FFFFFF values,
+	// it will reset to 0x??000000.
+	extraNonce    uint32
+	currentWorkID uint32
+
+	midstate  [8]uint32
+	lastBlock [16]uint32
+
+	work     work.Work
+	newWork  chan *work.Work // SetWork delivers new work on this channel
+	workDone chan []byte
+	hasWork  bool // consulted by updateCurrentWork
+
+	started          uint32 // Unix time in seconds; PrintStats measures elapsed time from it
+	allDiffOneShares uint64
+	validShares      uint64
+	invalidShares    uint64
+
+	quit chan struct{}
+}
+
 func decredCPUSetBlock52(input *[192]byte) {
 	if input == nil {
 		panic("input is nil")
@@ -43,7 +102,7 @@ func decredHashNonce(gridx, blockx, threads uint32, startNonce uint32, nonceResu
 		C.uint32_t(startNonce), (*C.uint32_t)(unsafe.Pointer(nonceResults)), C.uint32_t(targetHigh))
 }
 
-func deviceInfoNVIDIA(index int) (uint32, uint32) {
+func deviceInfo(index int) (uint32, uint32) {
 	fanPercent := uint32(0)
 	temperature := uint32(0)
 
@@ -76,7 +135,7 @@ func deviceInfoNVIDIA(index int) (uint32, uint32) {
 	return fanPercent, temperature
 }
 
-func getCUInfo() ([]cu.Device, error) {
+func getInfo() ([]cu.Device, error) {
 	cu.Init(0)
 	ids := cu.DeviceGetCount()
 	minrLog.Infof("%v GPUs", ids)
@@ -119,8 +178,8 @@ func getCUDevices() ([]cu.Device, error) {
 	return devices, nil
 }
 
-// ListCuDevices prints a list of CUDA capable GPUs present.
-func ListCuDevices() {
+// ListDevices prints a list of CUDA capable GPUs present.
+func ListDevices() {
 	// CUDA devices
 	// Because mumux3/3/cuda/cu likes to panic instead of error.
 	defer func() {
@@ -152,7 +211,7 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 
 	d.cuInSize = 21
 
-	fanPercent, temperature := deviceInfoNVIDIA(d.index)
+	fanPercent, temperature := deviceInfo(d.index)
 	// Newer cards will idle with the fan off so just check if we got
 	// a good temperature reading
 	if temperature != 0 {
@@ -168,7 +227,7 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 	return d, nil
 }
 
-func (d *Device) runCuDevice() error {
+func (d *Device) runDevice() error {
 	// Bump the extraNonce for the device it's running on
 	// when you begin mining. This ensures each GPU is doing
 	// different work. If the extraNonce has already been
@@ -286,3 +345,49 @@ func minUint32(a, b uint32) uint32 {
 		return b
 	}
 }
+
+// newMinerDevs creates a Device for every CUDA GPU that cfg.DeviceIDs allows
+// (all of them when the list is empty), appends each to m.devices and returns
+// m with the number of devices enabled.  Any error yields a nil miner.
+func newMinerDevs(m *Miner) (*Miner, int, error) {
+	deviceListEnabledCount := 0
+
+	CUdeviceIDs, err := getInfo()
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// XXX Can probably combine these bits with the opencl ones once
+	// I decide what to do about the types.
+
+	for deviceListIndex, CUDeviceID := range CUdeviceIDs {
+		// Enforce device restrictions if they exist; an empty list
+		// allows every device.
+		miningAllowed := len(cfg.DeviceIDs) == 0
+		for _, i := range cfg.DeviceIDs {
+			if deviceListIndex == i {
+				miningAllowed = true
+				break
+			}
+		}
+		if !miningAllowed {
+			continue
+		}
+		// Check the error before using the result so that a device
+		// that failed to initialise is never counted or stored.
+		newDevice, err := NewCuDevice(deviceListIndex, deviceListEnabledCount, CUDeviceID, m.workDone)
+		if err != nil {
+			return nil, 0, err
+		}
+		deviceListEnabledCount++
+		m.devices = append(m.devices, newDevice)
+	}
+
+	return m, deviceListEnabledCount, nil
+}
+
+func (d *Device) Release() {
+	d.cuContext.SetCurrent() // presumably makes d.cuContext current before it is destroyed; confirm in the cu package
+	//d.cuInput.Free()
+	cu.CtxDestroy(&d.cuContext)
+}
diff --git a/device.go b/device.go
index aa350c3..b54cbae 100644
--- a/device.go
+++ b/device.go
@@ -3,146 +3,23 @@
 package main
 
 import (
-	"bytes"
 	"encoding/binary"
 	"encoding/hex"
-	"fmt"
-	"io"
 	"math/big"
-	"os"
-	"sync"
 	"sync/atomic"
 	"time"
-	"unsafe"
-
-	"github.com/mumax/3/cuda/cu"
 
 	"github.com/decred/dcrd/blockchain"
 	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 
 	"github.com/decred/gominer/blake256"
-	"github.com/decred/gominer/cl"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
-const (
-	outputBufferSize   = cl.CL_size_t(64)
-	localWorksize      = 64
-	uint32Size         = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
-	cuOutputBufferSize = 64
-)
-
 var chainParams = &chaincfg.MainNetParams
 
-var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
-
-func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
-	var programBuffer [1][]byte
-	var programSize [1]cl.CL_size_t
-
-	// Read each program file and place content into buffer array.
-	programHandle, err := os.Open(filename)
-	if err != nil {
-		return nil, nil, err
-	}
-	defer programHandle.Close()
-
-	buf := bytes.NewBuffer(nil)
-	_, err = io.Copy(buf, programHandle)
-	if err != nil {
-		return nil, nil, err
-	}
-	str := string(buf.Bytes())
-	programFinal := []byte(str)
-
-	programSize[0] = cl.CL_size_t(len(programFinal))
-	programBuffer[0] = make([]byte, programSize[0])
-	for i := range programFinal {
-		programBuffer[0][i] = programFinal[i]
-	}
-
-	return programBuffer[:], programSize[:], nil
-}
-
-type Device struct {
-	// The following variables must only be used atomically.
-	fanPercent  uint32
-	temperature uint32
-
-	sync.Mutex
-	index int
-	cuda  bool
-
-	// Items for OpenCL device
-	platformID    cl.CL_platform_id
-	deviceID      cl.CL_device_id
-	deviceName    string
-	context       cl.CL_context
-	queue         cl.CL_command_queue
-	outputBuffer  cl.CL_mem
-	program       cl.CL_program
-	kernel        cl.CL_kernel
-	fanTempActive bool
-	kind          string
-
-	// Items for CUDA device
-	cuDeviceID cu.Device
-	cuContext  cu.Context
-	//cuInput        cu.DevicePtr
-	cuInSize       int64
-	cuOutputBuffer []float64
-
-	workSize uint32
-
-	// extraNonce is the device extraNonce, where the first
-	// byte is the device ID (supporting up to 255 devices)
-	// while the last 3 bytes is the extraNonce value. If
-	// the extraNonce goes through all 0x??FFFFFF values,
-	// it will reset to 0x??000000.
-	extraNonce    uint32
-	currentWorkID uint32
-
-	midstate  [8]uint32
-	lastBlock [16]uint32
-
-	work     work.Work
-	newWork  chan *work.Work
-	workDone chan []byte
-	hasWork  bool
-
-	started          uint32
-	allDiffOneShares uint64
-	validShares      uint64
-	invalidShares    uint64
-
-	quit chan struct{}
-}
-
-func clError(status cl.CL_int, f string) error {
-	if -status < 0 || int(-status) > len(cl.ERROR_CODES_STRINGS) {
-		return fmt.Errorf("returned unknown error")
-	}
-
-	return fmt.Errorf("%s returned error %s (%d)", f,
-		cl.ERROR_CODES_STRINGS[-status], status)
-}
-
-func (d *Device) Release() {
-	if d.cuda {
-		d.cuContext.SetCurrent()
-		//d.cuInput.Free()
-		cu.CtxDestroy(&d.cuContext)
-	} else {
-		cl.CLReleaseKernel(d.kernel)
-		cl.CLReleaseProgram(d.program)
-		cl.CLReleaseCommandQueue(d.queue)
-		cl.CLReleaseMemObject(d.outputBuffer)
-		cl.CLReleaseContext(d.context)
-	}
-}
-
 func (d *Device) updateCurrentWork() {
 	var w *work.Work
 	if d.hasWork {
@@ -191,12 +68,7 @@ func (d *Device) updateCurrentWork() {
 }
 
 func (d *Device) Run() {
-	var err error
-	if d.cuda {
-		err = d.runCuDevice()
-	} else {
-		err = d.runDevice()
-	}
+	err := d.runDevice()
 	if err != nil {
 		minrLog.Errorf("Error on device: %v", err)
 	}
@@ -281,30 +153,6 @@ func (d *Device) SetWork(w *work.Work) {
 	d.newWork <- w
 }
 
-func getDeviceInfo(id cl.CL_device_id,
-	name cl.CL_device_info,
-	str string) string {
-
-	var errNum cl.CL_int
-	var paramValueSize cl.CL_size_t
-
-	errNum = cl.CLGetDeviceInfo(id, name, 0, nil, &paramValueSize)
-
-	if errNum != cl.CL_SUCCESS {
-		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
-	}
-
-	var info interface{}
-	errNum = cl.CLGetDeviceInfo(id, name, paramValueSize, &info, nil)
-	if errNum != cl.CL_SUCCESS {
-		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
-	}
-
-	strinfo := fmt.Sprintf("%v", info)
-
-	return strinfo
-}
-
 func (d *Device) PrintStats() {
 	secondsElapsed := uint32(time.Now().Unix()) - d.started
 	if secondsElapsed == 0 {
@@ -345,14 +193,12 @@ func (d *Device) UpdateFanTemp() {
 	d.Lock()
 	defer d.Unlock()
 	if d.fanTempActive {
+		// For now AMD and NVIDIA do more or less the same thing
+		// but could be split up later.  Anything else (Intel) just
+		// doesn't do anything.
 		switch d.kind {
-		case "amdgpu":
-			fanPercent, temperature := deviceInfoAMDGPU(d.index)
-			atomic.StoreUint32(&d.fanPercent, fanPercent)
-			atomic.StoreUint32(&d.temperature, temperature)
-			break
-		case "nvidia":
-			fanPercent, temperature := deviceInfoNVIDIA(d.index)
+		case "amdgpu", "nvidia":
+			fanPercent, temperature := deviceInfo(d.index)
 			atomic.StoreUint32(&d.fanPercent, fanPercent)
 			atomic.StoreUint32(&d.temperature, temperature)
 			break
diff --git a/main.go b/main.go
index 1e8b33b..35773cd 100644
--- a/main.go
+++ b/main.go
@@ -25,7 +25,7 @@ func gominerMain() error {
 	defer backendLog.Flush()
 
 	// Show version at startup.
-	mainLog.Infof("Version %s", version())
+	mainLog.Infof("Version %s %s", version(), gpuLib())
 
 	// Enable http profiling server if requested.
 	if cfg.Profile != "" {
diff --git a/miner.go b/miner.go
index d801a2f..05a9daa 100644
--- a/miner.go
+++ b/miner.go
@@ -45,79 +45,9 @@ func NewMiner() (*Miner, error) {
 		m.pool = s
 	}
 
-	deviceListIndex := 0
-	deviceListEnabledCount := 0
-
-	if cfg.UseCuda {
-		CUdeviceIDs, err := getCUInfo()
-		if err != nil {
-			return nil, err
-		}
-
-		// XXX Can probably combine these bits with the opencl ones once
-		// I decide what to do about the types.
-
-		for _, CUDeviceID := range CUdeviceIDs {
-			miningAllowed := false
-
-			// Enforce device restrictions if they exist
-			if len(cfg.DeviceIDs) > 0 {
-				for _, i := range cfg.DeviceIDs {
-					if deviceListIndex == i {
-						miningAllowed = true
-					}
-				}
-			} else {
-				miningAllowed = true
-			}
-
-			if miningAllowed {
-				newDevice, err := NewCuDevice(deviceListIndex, deviceListEnabledCount, CUDeviceID, m.workDone)
-				deviceListEnabledCount++
-				m.devices = append(m.devices, newDevice)
-				if err != nil {
-					return nil, err
-				}
-			}
-			deviceListIndex++
-		}
-	} else {
-		platformIDs, err := getCLPlatforms()
-		if err != nil {
-			return nil, fmt.Errorf("Could not get CL platforms: %v", err)
-		}
-
-		for p := range platformIDs {
-			platformID := platformIDs[p]
-			CLdeviceIDs, err := getCLDevices(platformID)
-			if err != nil {
-				return nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
-			}
-
-			for _, CLdeviceID := range CLdeviceIDs {
-				miningAllowed := false
-
-				// Enforce device restrictions if they exist
-				if len(cfg.DeviceIDs) > 0 {
-					for _, i := range cfg.DeviceIDs {
-						if deviceListIndex == i {
-							miningAllowed = true
-						}
-					}
-				} else {
-					miningAllowed = true
-				}
-				if miningAllowed {
-					newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
-					deviceListEnabledCount++
-					m.devices = append(m.devices, newDevice)
-					if err != nil {
-						return nil, err
-					}
-				}
-				deviceListIndex++
-			}
-		}
+	m, deviceListEnabledCount, err := newMinerDevs(m)
+	if err != nil {
+		return nil, err
 	}
 
 	if deviceListEnabledCount == 0 {

From b7634914b65ba7c78e8bfce0441b2edce06b6206 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Thu, 22 Sep 2016 10:56:53 -0500
Subject: [PATCH 063/150] implement ADL support to fetch fan/temperature
 information (#106)

---
 README.md            |   11 +-
 adl/adl.c            |  150 +++
 adl/adl.go           |   31 +
 adl/adl_defines.h    | 1984 ++++++++++++++++++++++++++++++
 adl/adl_functions.h  |  292 +++++
 adl/adl_sdk.h        |   29 +
 adl/adl_structures.h | 2769 ++++++++++++++++++++++++++++++++++++++++++
 calibrate.go         |    2 +-
 cladldevice.go       |  553 +++++++++
 cldevice.go          |   12 +-
 cudevice.go          |    6 +-
 device.go            |    4 +-
 12 files changed, 5828 insertions(+), 15 deletions(-)
 create mode 100644 adl/adl.c
 create mode 100644 adl/adl.go
 create mode 100644 adl/adl_defines.h
 create mode 100644 adl/adl_functions.h
 create mode 100644 adl/adl_sdk.h
 create mode 100644 adl/adl_structures.h
 create mode 100644 cladldevice.go

diff --git a/README.md b/README.md
index f831d60..f59bbfa 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 You need to have the OpenCL or CUDA development libraries
 installed (depending on which version of gominer you would like to
-build) . You also need the runtime and drives for the one you plan
+build). You also need the runtime and drivers for the one you plan
 on running (CUDA for nvidia, OpenCL for anything) To download and
 build gominer, run:
 
@@ -17,12 +17,17 @@ cd gominer
 glide i
 ```
 
-for opencl:
+For OpenCL:
 ```
 go install -tags 'opencl'
 ```
 
-for cuda:
+For OpenCL with AMD Display Library (ADL) support:
+```
+go install -tags 'opencladl'
+```
+
+For CUDA with NVIDIA Management Library (NVML) support:
 ```
 make
 go install -tags 'cuda'
diff --git a/adl/adl.c b/adl/adl.c
new file mode 100644
index 0000000..03624d0
--- /dev/null
+++ b/adl/adl.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2011-2012 Con Kolivas
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
+ */
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "adl_sdk.h"
+#include "adl_functions.h"
+
+// declarations in adl_functions.h for these are formatted for dynamic loading
+int ADL_Adapter_AdapterInfo_Get(LPAdapterInfo lpInfo, int iInputSize);
+int ADL_Adapter_ID_Get(int iAdapterIndex, int *lpAdapterID);
+int ADL_Adapter_NumberOfAdapters_Get(int *lpNumAdapters);
+int ADL_Main_Control_Create(ADL_MAIN_MALLOC_CALLBACK callback, int iEnumConnectedAdapters);
+int ADL_Main_Control_Destroy(void);
+int ADL_Overdrive5_FanSpeed_Get(int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
+int ADL_Overdrive5_Temperature_Get (int iAdapterIndex, int iThermalControllerIndex, ADLTemperature *lpTemperature);
+
+int getADLInfo(int deviceid, char field[64]);
+
+// Memory allocation function handed to ADL_Main_Control_Create; ADL calls it
+// whenever it needs to allocate a buffer.
+static void * __stdcall ADL_Main_Memory_Alloc(int iSize)
+{
+  void *lpBuffer = malloc(iSize);
+
+  return lpBuffer;
+}
+
+// Optional Memory de-allocation function: frees *lpBuffer and resets it to
+// NULL.  Nothing in this file calls it yet.
+static void __stdcall ADL_Main_Memory_Free (void **lpBuffer)
+{
+  if (*lpBuffer != NULL) {
+    free (*lpBuffer);
+    *lpBuffer = NULL;
+  }
+}
+
+// getADLFanPercent returns the fan speed of adapter deviceid in percent, or 0
+// when it cannot be read.
+int getADLFanPercent(int deviceid) {
+  return getADLInfo(deviceid, "fanPercent");
+}
+
+// getADLTemp returns the temperature value ADL reports for adapter deviceid,
+// or 0 when it cannot be read.
+int getADLTemp(int deviceid) {
+  return getADLInfo(deviceid, "temp");
+}
+
+// getADLInfo returns one reading for the deviceid'th distinct adapter, counting
+// from 0 in the order ADL lists them; consecutive entries that share an adapter
+// ID are treated as one adapter.  field selects the reading: "fanPercent" for
+// the fan speed in percent or "temp" for the Overdrive5 temperature value.
+//
+// The result is 0 when ADL cannot be initialised, the adapter is not found,
+// field is unknown or the query itself fails.  The adapter list and the ADL
+// session opened here are released on every path.
+int getADLInfo(int deviceid, char field[64]) {
+  int result, i, devices = 0, last_adapter = -1, value = 0;
+  int iNumberAdapters = 0;
+  LPAdapterInfo lpInfo = NULL;
+
+  // Each call opens its own ADL session; it is closed again at "done"
+  if (ADL_OK != ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1)) {
+    return 0;
+  }
+
+  // Obtain the number of adapters for the system
+  result = ADL_Adapter_NumberOfAdapters_Get(&iNumberAdapters);
+  if (result != ADL_OK || iNumberAdapters <= 0) {
+    goto done;
+  }
+
+  // calloc returns zeroed memory, so no separate memset is needed
+  lpInfo = (LPAdapterInfo)calloc(iNumberAdapters, sizeof (AdapterInfo));
+  if (lpInfo == NULL) {
+    goto done;
+  }
+
+  // iSize holds the size of the AdapterInfo structure
+  lpInfo->iSize = sizeof (AdapterInfo);
+
+  // Get the AdapterInfo structure for all adapters in the system
+  result = ADL_Adapter_AdapterInfo_Get (lpInfo, sizeof (AdapterInfo) * iNumberAdapters);
+  if (result != ADL_OK) {
+    goto done;
+  }
+
+  /* Iterate over iNumberAdapters and find the lpAdapterID of real devices */
+  for (i = 0; i < iNumberAdapters; i++) {
+    int iAdapterIndex = lpInfo[i].iAdapterIndex;
+    int lpAdapterID;
+
+    /* Get unique identifier of the adapter, 0 means not AMD */
+    result = ADL_Adapter_ID_Get(iAdapterIndex, &lpAdapterID);
+    if (result != ADL_OK) {
+      continue;
+    }
+
+    /* Each adapter may have multiple entries */
+    if (lpAdapterID == last_adapter) {
+      continue;
+    }
+
+    // Only the requested adapter is queried
+    if (deviceid == devices) {
+      if (strcmp(field, "fanPercent") == 0) {
+        ADLFanSpeedValue lpFanSpeedValue = {0};
+        lpFanSpeedValue.iSize = sizeof(ADLFanSpeedValue);
+        lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
+        // value stays 0 when the query fails
+        if (ADL_OK == ADL_Overdrive5_FanSpeed_Get(iAdapterIndex, 0, &lpFanSpeedValue)) {
+          value = lpFanSpeedValue.iFanSpeed;
+        }
+        goto done;
+      }
+      if (strcmp(field, "temp") == 0) {
+        ADLTemperature lpTemperature = {0};
+        lpTemperature.iSize = sizeof(ADLTemperature);
+        // NOTE(review): iTemperature is passed through in whatever unit ADL
+        // reports; confirm whether the Go caller needs to scale it.
+        if (ADL_OK == ADL_Overdrive5_Temperature_Get(iAdapterIndex, 0, &lpTemperature)) {
+          value = lpTemperature.iTemperature;
+        }
+        goto done;
+      }
+      // Unknown field: keep scanning; no later adapter matches, result is 0
+    }
+
+    // Adapters reporting ID 0 (not AMD) are counted too
+    devices++;
+    last_adapter = lpAdapterID;
+  }
+
+done:
+  // Single exit path: release the adapter list and close the ADL session
+  free(lpInfo);
+  ADL_Main_Control_Destroy();
+  return value;
+}
diff --git a/adl/adl.go b/adl/adl.go
new file mode 100644
index 0000000..a480656
--- /dev/null
+++ b/adl/adl.go
@@ -0,0 +1,31 @@
+package adl
+
+/*
+// XXX all the C implementations use dlopen()
+#cgo linux CFLAGS: -DLINUX
+#cgo linux LDFLAGS: -latiadlxx -ldl
+#include <stddef.h>
+#include <stdbool.h>
+#include <adl_sdk.h>
+int getADLFanPercent(int deviceid);
+int getADLTemp(int deviceid);
+*/
+import "C"
+
+// DeviceFanPercent asks ADL for the fan speed, in percent, of the adapter at
+// the given device index.  The C helper's int result is converted to uint32
+// as is; the helper reports 0 when the value cannot be read.
+func DeviceFanPercent(index int) uint32 {
+	deviceID := C.int(index)
+	percent := C.getADLFanPercent(deviceID)
+	return uint32(percent)
+}
+
+// DeviceTemperature asks ADL for the temperature reading of the adapter at
+// the given device index.  The C helper's int result is converted to uint32
+// as is; the helper reports 0 when the value cannot be read.
+func DeviceTemperature(index int) uint32 {
+	deviceID := C.int(index)
+	reading := C.getADLTemp(deviceID)
+	return uint32(reading)
+}
diff --git a/adl/adl_defines.h b/adl/adl_defines.h
new file mode 100644
index 0000000..39f7421
--- /dev/null
+++ b/adl/adl_defines.h
@@ -0,0 +1,1984 @@
+//
+//  Copyright (c) 2008 - 2013 Advanced Micro Devices, Inc.
+
+//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
+//  EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
+//  WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+
+/// \file adl_defines.h
+/// \brief Contains all definitions exposed by ADL for \ALL platforms.\n <b>Included in ADL SDK</b>
+///
+/// This file contains all definitions used by ADL.
+/// The ADL definitions include the following:
+/// \li ADL error codes
+/// \li Enumerations for the ADLDisplayInfo structure
+/// \li Maximum limits
+///
+
+#ifndef ADL_DEFINES_H_
+#define ADL_DEFINES_H_
+
+/// \defgroup DEFINES Constants and Definitions
+// @{
+
+/// \defgroup define_misc Miscellaneous Constant Definitions
+// @{
+
+/// \name General Definitions
+// @{
+
+/// Defines ADL_TRUE
+#define ADL_TRUE	1
+/// Defines ADL_FALSE
+#define ADL_FALSE		0
+
+/// Defines the maximum string length
+#define ADL_MAX_CHAR                                    4096
+/// Defines the maximum string length
+#define ADL_MAX_PATH                                    256
+/// Defines the maximum number of supported adapters
+#define ADL_MAX_ADAPTERS                               250
+/// Defines the maxumum number of supported displays
+#define ADL_MAX_DISPLAYS                                150
+/// Defines the maxumum string length for device name
+#define ADL_MAX_DEVICENAME								32
+/// Defines for all adapters
+#define ADL_ADAPTER_INDEX_ALL							-1
+///	Defines APIs with iOption none
+#define ADL_MAIN_API_OPTION_NONE						0
+// @}
+
+/// \name Definitions for iOption parameter used by
+/// ADL_Display_DDCBlockAccess_Get()
+// @{
+
+/// Switch to DDC line 2 before sending the command to the display.
+#define ADL_DDC_OPTION_SWITCHDDC2              0x00000001
+/// Save command in the registry under a unique key, corresponding to parameter \b iCommandIndex
+#define ADL_DDC_OPTION_RESTORECOMMAND 0x00000002
+/// Combine write-read DDC block access command.
+#define ADL_DDC_OPTION_COMBOWRITEREAD 0x00000010
+/// Direct DDC access to the immediate device connected to graphics card.
+/// MST with this option set: DDC command is sent to first branch.
+/// MST with this option not set: DDC command is sent to the end node sink device.
+#define ADL_DDC_OPTION_SENDTOIMMEDIATEDEVICE 0x00000020
+// @}
+
+/// \name Values for
+/// ADLI2C.iAction used with ADL_Display_WriteAndReadI2C()
+// @{
+
+#define ADL_DL_I2C_ACTIONREAD									0x00000001
+#define ADL_DL_I2C_ACTIONWRITE								0x00000002
+#define ADL_DL_I2C_ACTIONREAD_REPEATEDSTART    0x00000003
+// @}
+
+
+// @}		//Misc
+
+/// \defgroup define_adl_results Result Codes
+/// This group of definitions are the various results returned by all ADL functions \n
+// @{
+/// All OK, but need to wait
+#define ADL_OK_WAIT				4
+/// All OK, but need restart
+#define ADL_OK_RESTART				3
+/// All OK but need mode change
+#define ADL_OK_MODE_CHANGE			2
+/// All OK, but with warning
+#define ADL_OK_WARNING				1
+/// ADL function completed successfully
+#define ADL_OK					0
+/// Generic Error. Most likely one or more of the Escape calls to the driver failed!
+#define ADL_ERR					-1
+/// ADL not initialized
+#define ADL_ERR_NOT_INIT			-2
+/// One of the parameter passed is invalid
+#define ADL_ERR_INVALID_PARAM			-3
+/// One of the parameter size is invalid
+#define ADL_ERR_INVALID_PARAM_SIZE		-4
+/// Invalid ADL index passed
+#define ADL_ERR_INVALID_ADL_IDX			-5
+/// Invalid controller index passed
+#define ADL_ERR_INVALID_CONTROLLER_IDX		-6
+/// Invalid display index passed
+#define ADL_ERR_INVALID_DIPLAY_IDX		-7
+/// Function  not supported by the driver
+#define ADL_ERR_NOT_SUPPORTED			-8
+/// Null Pointer error
+#define ADL_ERR_NULL_POINTER			-9
+/// Call can't be made due to disabled adapter
+#define ADL_ERR_DISABLED_ADAPTER		-10
+/// Invalid Callback
+#define ADL_ERR_INVALID_CALLBACK        	-11
+/// Display Resource conflict
+#define ADL_ERR_RESOURCE_CONFLICT				-12
+//Failed to update some of the values. Can be returned by set request that include multiple values if not all values were successfully committed.
+#define ADL_ERR_SET_INCOMPLETE 				-20
+/// There's no Linux XDisplay in Linux Console environment
+#define ADL_ERR_NO_XDISPLAY					-21
+
+// @}
+/// </A>
+
+/// \defgroup define_display_type Display Type
+/// Define Monitor/CRT display type
+// @{
+/// Define Monitor display type
+#define ADL_DT_MONITOR          		0
+/// Define TV display type
+#define ADL_DT_TELEVISION                	1
+/// Define LCD display type
+#define ADL_DT_LCD_PANEL               		2
+/// Define DFP display type
+#define ADL_DT_DIGITAL_FLAT_PANEL		3
+/// Define Componment Video display type
+#define ADL_DT_COMPONENT_VIDEO           	4
+/// Define Projector display type
+#define ADL_DT_PROJECTOR           	        5
+// @}
+
+/// \defgroup define_display_connection_type Display Connection Type
+// @{
+/// Define unknown display output type
+#define ADL_DOT_UNKNOWN				0
+/// Define composite display output type
+#define ADL_DOT_COMPOSITE			1
+/// Define SVideo display output type
+#define ADL_DOT_SVIDEO				2
+/// Define analog display output type
+#define ADL_DOT_ANALOG				3
+/// Define digital display output type
+#define ADL_DOT_DIGITAL				4
+// @}
+
+/// \defgroup define_color_type Display Color Type and Source
+/// Define  Display Color Type and Source
+// @{
+#define ADL_DISPLAY_COLOR_BRIGHTNESS	(1 << 0)
+#define ADL_DISPLAY_COLOR_CONTRAST	(1 << 1)
+#define ADL_DISPLAY_COLOR_SATURATION	(1 << 2)
+#define ADL_DISPLAY_COLOR_HUE		(1 << 3)
+#define ADL_DISPLAY_COLOR_TEMPERATURE	(1 << 4)
+
+/// Color Temperature Source is EDID
+#define ADL_DISPLAY_COLOR_TEMPERATURE_SOURCE_EDID	(1 << 5)
+/// Color Temperature Source is User
+#define ADL_DISPLAY_COLOR_TEMPERATURE_SOURCE_USER	(1 << 6)
+// @}
+
+/// \defgroup define_adjustment_capabilities Display Adjustment Capabilities
+/// Display adjustment capabilities values.  Returned by ADL_Display_AdjustCaps_Get
+// @{
+#define ADL_DISPLAY_ADJUST_OVERSCAN		(1 << 0)
+#define ADL_DISPLAY_ADJUST_VERT_POS		(1 << 1)
+#define ADL_DISPLAY_ADJUST_HOR_POS		(1 << 2)
+#define ADL_DISPLAY_ADJUST_VERT_SIZE		(1 << 3)
+#define ADL_DISPLAY_ADJUST_HOR_SIZE		(1 << 4)
+#define ADL_DISPLAY_ADJUST_SIZEPOS		(ADL_DISPLAY_ADJUST_VERT_POS | ADL_DISPLAY_ADJUST_HOR_POS | ADL_DISPLAY_ADJUST_VERT_SIZE | ADL_DISPLAY_ADJUST_HOR_SIZE)
+#define ADL_DISPLAY_CUSTOMMODES			(1<<5)
+#define ADL_DISPLAY_ADJUST_UNDERSCAN		(1<<6)
+// @}
+
+///Down-scale support
+#define ADL_DISPLAY_CAPS_DOWNSCALE		(1 << 0)
+
+/// Sharpness support
+#define ADL_DISPLAY_CAPS_SHARPNESS      (1 << 0)
+
+/// \defgroup define_desktop_config Desktop Configuration Flags
+/// These flags are used by ADL_DesktopConfig_xxx
+/// \deprecated This API has been deprecated because it was only used for RandR 1.1 (Red Hat 5.x) distributions which is now not supported.
+// @{
+#define ADL_DESKTOPCONFIG_UNKNOWN    0    	  /* UNKNOWN desktop config   */
+#define ADL_DESKTOPCONFIG_SINGLE     (1 <<  0)    /* Single                   */
+#define ADL_DESKTOPCONFIG_CLONE      (1 <<  2)    /* Clone                    */
+#define ADL_DESKTOPCONFIG_BIGDESK_H  (1 <<  4)    /* Big Desktop Horizontal   */
+#define ADL_DESKTOPCONFIG_BIGDESK_V  (1 <<  5)    /* Big Desktop Vertical     */
+#define ADL_DESKTOPCONFIG_BIGDESK_HR (1 <<  6)    /* Big Desktop Reverse Horz */
+#define ADL_DESKTOPCONFIG_BIGDESK_VR (1 <<  7)    /* Big Desktop Reverse Vert */
+#define ADL_DESKTOPCONFIG_RANDR12    (1 <<  8)    /* RandR 1.2 Multi-display */
+// @}
+
+/// needed for ADLDDCInfo structure
+#define ADL_MAX_DISPLAY_NAME                                256
+
+/// \defgroup define_edid_flags Values for ulDDCInfoFlag
+/// defines for ulDDCInfoFlag EDID flag
+// @{
+#define ADL_DISPLAYDDCINFOEX_FLAG_PROJECTORDEVICE       (1 << 0)
+#define ADL_DISPLAYDDCINFOEX_FLAG_EDIDEXTENSION         (1 << 1)
+#define ADL_DISPLAYDDCINFOEX_FLAG_DIGITALDEVICE         (1 << 2)
+#define ADL_DISPLAYDDCINFOEX_FLAG_HDMIAUDIODEVICE       (1 << 3)
+#define ADL_DISPLAYDDCINFOEX_FLAG_SUPPORTS_AI           (1 << 4)
+#define ADL_DISPLAYDDCINFOEX_FLAG_SUPPORT_xvYCC601      (1 << 5)
+#define ADL_DISPLAYDDCINFOEX_FLAG_SUPPORT_xvYCC709      (1 << 6)
+// @}
+
+/// \defgroup define_displayinfo_connector Display Connector Type
+/// defines for ADLDisplayInfo.iDisplayConnector
+// @{
+#define ADL_DISPLAY_CONTYPE_UNKNOWN                 0
+#define ADL_DISPLAY_CONTYPE_VGA                     1
+#define ADL_DISPLAY_CONTYPE_DVI_D                   2
+#define ADL_DISPLAY_CONTYPE_DVI_I                   3
+#define ADL_DISPLAY_CONTYPE_ATICVDONGLE_NTSC        4
+#define ADL_DISPLAY_CONTYPE_ATICVDONGLE_JPN         5
+#define ADL_DISPLAY_CONTYPE_ATICVDONGLE_NONI2C_JPN  6
+#define ADL_DISPLAY_CONTYPE_ATICVDONGLE_NONI2C_NTSC 7
+#define ADL_DISPLAY_CONTYPE_PROPRIETARY				8
+#define ADL_DISPLAY_CONTYPE_HDMI_TYPE_A             10
+#define ADL_DISPLAY_CONTYPE_HDMI_TYPE_B             11
+#define ADL_DISPLAY_CONTYPE_SVIDEO               	12
+#define ADL_DISPLAY_CONTYPE_COMPOSITE               13
+#define ADL_DISPLAY_CONTYPE_RCA_3COMPONENT          14
+#define ADL_DISPLAY_CONTYPE_DISPLAYPORT             15
+#define ADL_DISPLAY_CONTYPE_EDP                     16
+#define ADL_DISPLAY_CONTYPE_WIRELESSDISPLAY         17
+// @}
+
+/// TV Capabilities and Standards
+/// \defgroup define_tv_caps TV Capabilities and Standards
+/// \deprecated Dropping support for TV displays
+// @{
+#define ADL_TV_STANDARDS			(1 << 0)
+#define ADL_TV_SCART				(1 << 1)
+
+/// TV Standards Definitions
+#define ADL_STANDARD_NTSC_M		(1 << 0)
+#define ADL_STANDARD_NTSC_JPN		(1 << 1)
+#define ADL_STANDARD_NTSC_N		(1 << 2)
+#define ADL_STANDARD_PAL_B		(1 << 3)
+#define ADL_STANDARD_PAL_COMB_N		(1 << 4)
+#define ADL_STANDARD_PAL_D		(1 << 5)
+#define ADL_STANDARD_PAL_G		(1 << 6)
+#define ADL_STANDARD_PAL_H		(1 << 7)
+#define ADL_STANDARD_PAL_I		(1 << 8)
+#define ADL_STANDARD_PAL_K		(1 << 9)
+#define ADL_STANDARD_PAL_K1		(1 << 10)
+#define ADL_STANDARD_PAL_L		(1 << 11)
+#define ADL_STANDARD_PAL_M		(1 << 12)
+#define ADL_STANDARD_PAL_N		(1 << 13)
+#define ADL_STANDARD_PAL_SECAM_D	(1 << 14)
+#define ADL_STANDARD_PAL_SECAM_K	(1 << 15)
+#define ADL_STANDARD_PAL_SECAM_K1	(1 << 16)
+#define ADL_STANDARD_PAL_SECAM_L	(1 << 17)
+// @}
+
+
+/// \defgroup define_video_custom_mode Video Custom Mode flags
+/// Component Video Custom Mode flags.  This is used by the iFlags parameter in ADLCustomMode
+// @{
+#define ADL_CUSTOMIZEDMODEFLAG_MODESUPPORTED	(1 << 0)
+#define ADL_CUSTOMIZEDMODEFLAG_NOTDELETETABLE	(1 << 1)
+#define ADL_CUSTOMIZEDMODEFLAG_INSERTBYDRIVER	(1 << 2)
+#define ADL_CUSTOMIZEDMODEFLAG_INTERLACED	(1 << 3)
+#define ADL_CUSTOMIZEDMODEFLAG_BASEMODE		(1 << 4)
+// @}
+
+/// \defgroup define_ddcinfoflag Values used for DDCInfoFlag
+/// ulDDCInfoFlag field values used by the ADLDDCInfo structure
+// @{
+#define ADL_DISPLAYDDCINFOEX_FLAG_PROJECTORDEVICE	(1 << 0)
+#define ADL_DISPLAYDDCINFOEX_FLAG_EDIDEXTENSION		(1 << 1)
+#define ADL_DISPLAYDDCINFOEX_FLAG_DIGITALDEVICE		(1 << 2)
+#define ADL_DISPLAYDDCINFOEX_FLAG_HDMIAUDIODEVICE	(1 << 3)
+#define ADL_DISPLAYDDCINFOEX_FLAG_SUPPORTS_AI		(1 << 4)
+#define ADL_DISPLAYDDCINFOEX_FLAG_SUPPORT_xvYCC601	(1 << 5)
+#define ADL_DISPLAYDDCINFOEX_FLAG_SUPPORT_xvYCC709	(1 << 6)
+// @}
+
+/// \defgroup define_cv_dongle Values used by ADL_CV_DongleSettings_xxx
+/// The following is applicable to ADL_DISPLAY_CONTYPE_ATICVDONGLE_JP and ADL_DISPLAY_CONTYPE_ATICVDONGLE_NONI2C_D only
+/// \deprecated Dropping support for Component Video displays
+// @{
+#define ADL_DISPLAY_CV_DONGLE_D1          (1 << 0)
+#define ADL_DISPLAY_CV_DONGLE_D2          (1 << 1)
+#define ADL_DISPLAY_CV_DONGLE_D3          (1 << 2)
+#define ADL_DISPLAY_CV_DONGLE_D4          (1 << 3)
+#define ADL_DISPLAY_CV_DONGLE_D5          (1 << 4)
+
+/// The following is applicable to ADL_DISPLAY_CONTYPE_ATICVDONGLE_NA and ADL_DISPLAY_CONTYPE_ATICVDONGLE_NONI2C only
+
+#define ADL_DISPLAY_CV_DONGLE_480I        (1 << 0)
+#define ADL_DISPLAY_CV_DONGLE_480P        (1 << 1)
+#define ADL_DISPLAY_CV_DONGLE_540P        (1 << 2)
+#define ADL_DISPLAY_CV_DONGLE_720P        (1 << 3)
+#define ADL_DISPLAY_CV_DONGLE_1080I       (1 << 4)
+#define ADL_DISPLAY_CV_DONGLE_1080P       (1 << 5)
+#define ADL_DISPLAY_CV_DONGLE_16_9        (1 << 6)
+#define ADL_DISPLAY_CV_DONGLE_720P50      (1 << 7)
+#define ADL_DISPLAY_CV_DONGLE_1080I25     (1 << 8)
+#define ADL_DISPLAY_CV_DONGLE_576I25      (1 << 9)
+#define ADL_DISPLAY_CV_DONGLE_576P50      (1 << 10)
+#define ADL_DISPLAY_CV_DONGLE_1080P24      (1 << 11)
+#define ADL_DISPLAY_CV_DONGLE_1080P25      (1 << 12)
+#define ADL_DISPLAY_CV_DONGLE_1080P30      (1 << 13)
+#define ADL_DISPLAY_CV_DONGLE_1080P50      (1 << 14)
+// @}
+
+/// \defgroup define_formats_ovr	Formats Override Settings
+/// Display force modes flags
+// @{
+///
+#define ADL_DISPLAY_FORMAT_FORCE_720P		0x00000001
+#define ADL_DISPLAY_FORMAT_FORCE_1080I		0x00000002
+#define ADL_DISPLAY_FORMAT_FORCE_1080P		0x00000004
+#define ADL_DISPLAY_FORMAT_FORCE_720P50		0x00000008
+#define ADL_DISPLAY_FORMAT_FORCE_1080I25	0x00000010
+#define ADL_DISPLAY_FORMAT_FORCE_576I25		0x00000020
+#define ADL_DISPLAY_FORMAT_FORCE_576P50		0x00000040
+#define ADL_DISPLAY_FORMAT_FORCE_1080P24	0x00000080
+#define ADL_DISPLAY_FORMAT_FORCE_1080P25	0x00000100
+#define ADL_DISPLAY_FORMAT_FORCE_1080P30	0x00000200
+#define ADL_DISPLAY_FORMAT_FORCE_1080P50	0x00000400
+
+///< Below are \b EXTENDED display mode flags
+
+#define ADL_DISPLAY_FORMAT_CVDONGLEOVERIDE  0x00000001
+#define ADL_DISPLAY_FORMAT_CVMODEUNDERSCAN  0x00000002
+#define ADL_DISPLAY_FORMAT_FORCECONNECT_SUPPORTED  0x00000004
+#define ADL_DISPLAY_FORMAT_RESTRICT_FORMAT_SELECTION 0x00000008
+#define ADL_DISPLAY_FORMAT_SETASPECRATIO 0x00000010
+#define ADL_DISPLAY_FORMAT_FORCEMODES    0x00000020
+#define ADL_DISPLAY_FORMAT_LCDRTCCOEFF   0x00000040
+// @}
+
+/// Defines used by OD5
+#define ADL_PM_PARAM_DONT_CHANGE    0
+
+/// The following defines Bus types
+// @{
+#define ADL_BUSTYPE_PCI           0       /* PCI bus                          */
+#define ADL_BUSTYPE_AGP           1       /* AGP bus                          */
+#define ADL_BUSTYPE_PCIE          2       /* PCI Express bus                  */
+#define ADL_BUSTYPE_PCIE_GEN2     3       /* PCI Express 2nd generation bus   */
+#define ADL_BUSTYPE_PCIE_GEN3     4       /* PCI Express 3rd generation bus   */
+// @}
+
+/// \defgroup define_ws_caps	Workstation Capabilities
+/// Workstation values
+// @{
+
+/// This value indicates that the workstation card supports active stereo through the stereo output connector
+#define ADL_STEREO_SUPPORTED		(1 << 2)
+/// This value indicates that the workstation card supports active stereo via "blue-line"
+#define ADL_STEREO_BLUE_LINE		(1 << 3)
+/// This value is used to turn off stereo mode.
+#define ADL_STEREO_OFF				0
+/// This value indicates that the workstation card supports active stereo.  This is also used to set the stereo mode to active through the stereo output connector
+#define ADL_STEREO_ACTIVE	 		(1 << 1)
+/// This value indicates that the workstation card supports auto-stereo monitors with horizontal interleave. This is also used to set the stereo mode to use the auto-stereo monitor with horizontal interleave
+#define ADL_STEREO_AUTO_HORIZONTAL	(1 << 30)
+/// This value indicates that the workstation card supports auto-stereo monitors with vertical interleave. This is also used to set the stereo mode to use the auto-stereo monitor with vertical interleave
+#define ADL_STEREO_AUTO_VERTICAL	(1 << 31)
+/// This value indicates that the workstation card supports passive stereo, ie. non stereo sync
+#define ADL_STEREO_PASSIVE              (1 << 6)
+/// This value indicates that the workstation card supports passive stereo, horizontal variant (per the macro name; the earlier text duplicated the ADL_STEREO_AUTO_VERTICAL description - confirm exact semantics with the ADL SDK)
+#define ADL_STEREO_PASSIVE_HORIZ        (1 << 7)
+/// This value indicates that the workstation card supports passive stereo, vertical variant (per the macro name; the earlier text duplicated the ADL_STEREO_AUTO_VERTICAL description - confirm exact semantics with the ADL SDK)
+#define ADL_STEREO_PASSIVE_VERT         (1 << 8)
+/// This value indicates that the workstation card supports auto-stereo monitors with Samsung.
+#define ADL_STEREO_AUTO_SAMSUNG        (1 << 11)
+/// This value indicates that the workstation card supports auto-stereo monitors with Tridelity.
+#define ADL_STEREO_AUTO_TSL         (1 << 12)
+/// This value indicates that the workstation card supports DeepBitDepth (10 bpp)
+#define ADL_DEEPBITDEPTH_10BPP_SUPPORTED   (1 << 5)
+
+/// This value indicates that the workstation supports 8-Bit Grayscale
+#define ADL_8BIT_GREYSCALE_SUPPORTED   (1 << 9)
+/// This value indicates that the workstation supports CUSTOM TIMING
+#define ADL_CUSTOM_TIMING_SUPPORTED   (1 << 10)
+
+/// Load balancing is supported.
+#define ADL_WORKSTATION_LOADBALANCING_SUPPORTED         0x00000001
+/// Load balancing is available.
+#define ADL_WORKSTATION_LOADBALANCING_AVAILABLE         0x00000002
+
+/// Load balancing is disabled.
+#define ADL_WORKSTATION_LOADBALANCING_DISABLED          0x00000000
+/// Load balancing is Enabled.
+#define ADL_WORKSTATION_LOADBALANCING_ENABLED           0x00000001
+
+
+
+// @}
+
+/// \defgroup define_adapterspeed speed setting from the adapter
+// @{
+#define ADL_CONTEXT_SPEED_UNFORCED		0		/* default asic running speed */
+#define ADL_CONTEXT_SPEED_FORCEHIGH		1		/* asic running speed is forced to high */
+#define ADL_CONTEXT_SPEED_FORCELOW		2		/* asic running speed is forced to low */
+
+#define ADL_ADAPTER_SPEEDCAPS_SUPPORTED		(1 << 0)	/* change asic running speed setting is supported */
+// @}
+
+/// \defgroup define_glsync Genlock related values
+/// GL-Sync port types (unique values)
+// @{
+/// Unknown port of GL-Sync module
+#define ADL_GLSYNC_PORT_UNKNOWN		0
+/// BNC port of the GL-Sync module
+#define ADL_GLSYNC_PORT_BNC			1
+/// RJ45(1) port of the GL-Sync module
+#define ADL_GLSYNC_PORT_RJ45PORT1	2
+/// RJ45(2) port of the GL-Sync module
+#define ADL_GLSYNC_PORT_RJ45PORT2	3
+
+// GL-Sync Genlock settings mask (bit-vector)
+
+/// None of the ADLGLSyncGenlockConfig members are valid
+#define ADL_GLSYNC_CONFIGMASK_NONE				0
+/// The ADLGLSyncGenlockConfig.lSignalSource member is valid
+#define ADL_GLSYNC_CONFIGMASK_SIGNALSOURCE		(1 << 0)
+/// The ADLGLSyncGenlockConfig.iSyncField member is valid
+#define ADL_GLSYNC_CONFIGMASK_SYNCFIELD			(1 << 1)
+/// The ADLGLSyncGenlockConfig.iSampleRate member is valid
+#define ADL_GLSYNC_CONFIGMASK_SAMPLERATE		(1 << 2)
+/// The ADLGLSyncGenlockConfig.lSyncDelay member is valid
+#define ADL_GLSYNC_CONFIGMASK_SYNCDELAY			(1 << 3)
+/// The ADLGLSyncGenlockConfig.iTriggerEdge member is valid
+#define ADL_GLSYNC_CONFIGMASK_TRIGGEREDGE		(1 << 4)
+/// The ADLGLSyncGenlockConfig.iScanRateCoeff member is valid
+#define ADL_GLSYNC_CONFIGMASK_SCANRATECOEFF		(1 << 5)
+/// The ADLGLSyncGenlockConfig.lFramelockCntlVector member is valid
+#define ADL_GLSYNC_CONFIGMASK_FRAMELOCKCNTL		(1 << 6)
+
+
+// GL-Sync Framelock control mask (bit-vector)
+
+/// Framelock is disabled
+#define ADL_GLSYNC_FRAMELOCKCNTL_NONE			0
+/// Framelock is enabled
+#define ADL_GLSYNC_FRAMELOCKCNTL_ENABLE			( 1 << 0)
+
+#define ADL_GLSYNC_FRAMELOCKCNTL_DISABLE		( 1 << 1)
+#define ADL_GLSYNC_FRAMELOCKCNTL_SWAP_COUNTER_RESET	( 1 << 2)
+#define ADL_GLSYNC_FRAMELOCKCNTL_SWAP_COUNTER_ACK	( 1 << 3)
+#define ADL_GLSYNC_FRAMELOCKCNTL_VERSION_KMD	(1 << 4)
+
+#define ADL_GLSYNC_FRAMELOCKCNTL_STATE_ENABLE		( 1 << 0)
+#define ADL_GLSYNC_FRAMELOCKCNTL_STATE_KMD		(1 << 4)
+
+// GL-Sync Framelock counters mask (bit-vector)
+#define ADL_GLSYNC_COUNTER_SWAP				( 1 << 0 )
+
+// GL-Sync Signal Sources (unique values)
+
+/// GL-Sync signal source is undefined
+#define ADL_GLSYNC_SIGNALSOURCE_UNDEFINED    0x00000100
+/// GL-Sync signal source is Free Run
+#define ADL_GLSYNC_SIGNALSOURCE_FREERUN      0x00000101
+/// GL-Sync signal source is the BNC GL-Sync port
+#define ADL_GLSYNC_SIGNALSOURCE_BNCPORT      0x00000102
+/// GL-Sync signal source is the RJ45(1) GL-Sync port
+#define ADL_GLSYNC_SIGNALSOURCE_RJ45PORT1    0x00000103
+/// GL-Sync signal source is the RJ45(2) GL-Sync port
+#define ADL_GLSYNC_SIGNALSOURCE_RJ45PORT2    0x00000104
+
+
+// GL-Sync Signal Types (unique values)
+
+/// GL-Sync signal type is unknown
+#define ADL_GLSYNC_SIGNALTYPE_UNDEFINED      0
+/// GL-Sync signal type is 480I
+#define ADL_GLSYNC_SIGNALTYPE_480I           1
+/// GL-Sync signal type is 576I
+#define ADL_GLSYNC_SIGNALTYPE_576I           2
+/// GL-Sync signal type is 480P
+#define ADL_GLSYNC_SIGNALTYPE_480P           3
+/// GL-Sync signal type is 576P
+#define ADL_GLSYNC_SIGNALTYPE_576P           4
+/// GL-Sync signal type is 720P
+#define ADL_GLSYNC_SIGNALTYPE_720P           5
+/// GL-Sync signal type is 1080P
+#define ADL_GLSYNC_SIGNALTYPE_1080P          6
+/// GL-Sync signal type is 1080I
+#define ADL_GLSYNC_SIGNALTYPE_1080I          7
+/// GL-Sync signal type is SDI
+#define ADL_GLSYNC_SIGNALTYPE_SDI            8
+/// GL-Sync signal type is TTL
+#define ADL_GLSYNC_SIGNALTYPE_TTL            9
+/// GL-Sync signal type is Analog
+#define ADL_GLSYNC_SIGNALTYPE_ANALOG		10
+
+// GL-Sync Sync Field options (unique values)
+
+///GL-Sync sync field option is undefined
+#define ADL_GLSYNC_SYNCFIELD_UNDEFINED		0
+///GL-Sync sync field option is Sync to Both fields (used for Interlaced signal types)
+#define ADL_GLSYNC_SYNCFIELD_BOTH			1
+///GL-Sync sync field option is Sync to Field 1 (used for Interlaced signal types)
+#define ADL_GLSYNC_SYNCFIELD_1				2
+
+
+// GL-Sync trigger edge options (unique values)
+
+/// GL-Sync trigger edge is undefined
+#define ADL_GLSYNC_TRIGGEREDGE_UNDEFINED     0
+/// GL-Sync trigger edge is the rising edge
+#define ADL_GLSYNC_TRIGGEREDGE_RISING        1
+/// GL-Sync trigger edge is the falling edge
+#define ADL_GLSYNC_TRIGGEREDGE_FALLING       2
+/// GL-Sync trigger edge is both the rising and the falling edge
+#define ADL_GLSYNC_TRIGGEREDGE_BOTH          3
+
+
+// GL-Sync scan rate coefficient/multiplier options (unique values)
+
+/// GL-Sync scan rate coefficient/multiplier is undefined
+#define ADL_GLSYNC_SCANRATECOEFF_UNDEFINED   0
+/// GL-Sync scan rate coefficient/multiplier is 5
+#define ADL_GLSYNC_SCANRATECOEFF_x5          1
+/// GL-Sync scan rate coefficient/multiplier is 4
+#define ADL_GLSYNC_SCANRATECOEFF_x4          2
+/// GL-Sync scan rate coefficient/multiplier is 3
+#define ADL_GLSYNC_SCANRATECOEFF_x3          3
+/// GL-Sync scan rate coefficient/multiplier is 5:2 (SMPTE)
+#define ADL_GLSYNC_SCANRATECOEFF_x5_DIV_2    4
+/// GL-Sync scan rate coefficient/multiplier is 2
+#define ADL_GLSYNC_SCANRATECOEFF_x2          5
+/// GL-Sync scan rate coefficient/multiplier is 3 : 2
+#define ADL_GLSYNC_SCANRATECOEFF_x3_DIV_2    6
+/// GL-Sync scan rate coefficient/multiplier is 5 : 4
+#define ADL_GLSYNC_SCANRATECOEFF_x5_DIV_4    7
+/// GL-Sync scan rate coefficient/multiplier is 1 (default)
+#define ADL_GLSYNC_SCANRATECOEFF_x1          8
+/// GL-Sync scan rate coefficient/multiplier is 4 : 5
+#define ADL_GLSYNC_SCANRATECOEFF_x4_DIV_5    9
+/// GL-Sync scan rate coefficient/multiplier is 2 : 3
+#define ADL_GLSYNC_SCANRATECOEFF_x2_DIV_3    10
+/// GL-Sync scan rate coefficient/multiplier is 1 : 2
+#define ADL_GLSYNC_SCANRATECOEFF_x1_DIV_2    11
+/// GL-Sync scan rate coefficient/multiplier is 2 : 5 (SMPTE)
+#define ADL_GLSYNC_SCANRATECOEFF_x2_DIV_5    12
+/// GL-Sync scan rate coefficient/multiplier is 1 : 3
+#define ADL_GLSYNC_SCANRATECOEFF_x1_DIV_3    13
+/// GL-Sync scan rate coefficient/multiplier is 1 : 4
+#define ADL_GLSYNC_SCANRATECOEFF_x1_DIV_4    14
+/// GL-Sync scan rate coefficient/multiplier is 1 : 5
+#define ADL_GLSYNC_SCANRATECOEFF_x1_DIV_5    15
+
+
+// GL-Sync port (signal presence) states (unique values)
+
+/// GL-Sync port state is undefined
+#define ADL_GLSYNC_PORTSTATE_UNDEFINED       0
+/// GL-Sync port is not connected
+#define ADL_GLSYNC_PORTSTATE_NOCABLE         1
+/// GL-Sync port is Idle
+#define ADL_GLSYNC_PORTSTATE_IDLE            2
+/// GL-Sync port has an Input signal
+#define ADL_GLSYNC_PORTSTATE_INPUT           3
+/// GL-Sync port is Output
+#define ADL_GLSYNC_PORTSTATE_OUTPUT          4
+
+
+// GL-Sync LED types (used index within ADL_Workstation_GLSyncPortState_Get returned ppGlSyncLEDs array) (unique values)
+
+/// Index into the ADL_Workstation_GLSyncPortState_Get returned ppGlSyncLEDs array for the one LED of the BNC port
+#define ADL_GLSYNC_LEDTYPE_BNC               0
+/// Index into the ADL_Workstation_GLSyncPortState_Get returned ppGlSyncLEDs array for the Left LED of the RJ45(1) or RJ45(2) port
+#define ADL_GLSYNC_LEDTYPE_RJ45_LEFT         0
+/// Index into the ADL_Workstation_GLSyncPortState_Get returned ppGlSyncLEDs array for the Right LED of the RJ45(1) or RJ45(2) port
+#define ADL_GLSYNC_LEDTYPE_RJ45_RIGHT        1
+
+
+// GL-Sync LED colors (unique values)
+
+/// GL-Sync LED undefined color
+#define ADL_GLSYNC_LEDCOLOR_UNDEFINED        0
+/// GL-Sync LED is unlit
+#define ADL_GLSYNC_LEDCOLOR_NOLIGHT          1
+/// GL-Sync LED is yellow
+#define ADL_GLSYNC_LEDCOLOR_YELLOW           2
+/// GL-Sync LED is red
+#define ADL_GLSYNC_LEDCOLOR_RED              3
+/// GL-Sync LED is green
+#define ADL_GLSYNC_LEDCOLOR_GREEN            4
+/// GL-Sync LED is flashing green
+#define ADL_GLSYNC_LEDCOLOR_FLASH_GREEN      5
+
+
+// GL-Sync Port Control (refers to one GL-Sync Port) (unique values)
+
+/// Used to configure the RJ45(1) or RJ45(2) port of GL-Sync as Idle
+#define ADL_GLSYNC_PORTCNTL_NONE             0x00000000
+/// Used to configure the RJ45(1) or RJ45(2) port of GL-Sync as Output
+#define ADL_GLSYNC_PORTCNTL_OUTPUT           0x00000001
+
+
+// GL-Sync Mode Control (refers to one Display/Controller) (bitfields)
+
+/// Used to configure the display to use internal timing (not genlocked)
+#define ADL_GLSYNC_MODECNTL_NONE             0x00000000
+/// Bitfield used to configure the display as genlocked (either as Timing Client or as Timing Server)
+#define ADL_GLSYNC_MODECNTL_GENLOCK          0x00000001
+/// Bitfield used to configure the display as Timing Server
+#define ADL_GLSYNC_MODECNTL_TIMINGSERVER     0x00000002
+
+// GL-Sync Mode Status
+/// Display is currently not genlocked
+#define ADL_GLSYNC_MODECNTL_STATUS_NONE		 0x00000000
+/// Display is currently genlocked
+#define ADL_GLSYNC_MODECNTL_STATUS_GENLOCK   0x00000001
+/// Display requires a mode switch
+#define ADL_GLSYNC_MODECNTL_STATUS_SETMODE_REQUIRED 0x00000002
+/// Display is capable of being genlocked
+#define ADL_GLSYNC_MODECNTL_STATUS_GENLOCK_ALLOWED 0x00000004
+
+#define ADL_MAX_GLSYNC_PORTS							8
+#define ADL_MAX_GLSYNC_PORT_LEDS						8
+
+// @}
+
+/// \defgroup define_crossfirestate CrossfireX state of a particular adapter CrossfireX combination
+// @{
+#define ADL_XFIREX_STATE_NOINTERCONNECT			( 1 << 0 )	/* Dongle / cable is missing */
+#define ADL_XFIREX_STATE_DOWNGRADEPIPES			( 1 << 1 )	/* CrossfireX can be enabled if pipes are downgraded */
+#define ADL_XFIREX_STATE_DOWNGRADEMEM			( 1 << 2 )	/* CrossfireX cannot be enabled unless mem downgraded */
+#define ADL_XFIREX_STATE_REVERSERECOMMENDED		( 1 << 3 )	/* Card reversal recommended, CrossfireX cannot be enabled. */
+#define ADL_XFIREX_STATE_3DACTIVE			( 1 << 4 )	/* 3D client is active - CrossfireX cannot be safely enabled */
+#define ADL_XFIREX_STATE_MASTERONSLAVE			( 1 << 5 )	/* Dongle is OK but master is on slave */
+#define ADL_XFIREX_STATE_NODISPLAYCONNECT		( 1 << 6 )	/* No (valid) display connected to master card. */
+#define ADL_XFIREX_STATE_NOPRIMARYVIEW			( 1 << 7 )	/* CrossfireX is enabled but master is not current primary device */
+#define ADL_XFIREX_STATE_DOWNGRADEVISMEM		( 1 << 8 )	/* CrossfireX cannot be enabled unless visible mem downgraded */
+#define ADL_XFIREX_STATE_LESSTHAN8LANE_MASTER		( 1 << 9 ) 	/* CrossfireX can be enabled however performance not optimal due to <8 lanes */
+#define ADL_XFIREX_STATE_LESSTHAN8LANE_SLAVE		( 1 << 10 )	/* CrossfireX can be enabled however performance not optimal due to <8 lanes */
+#define ADL_XFIREX_STATE_PEERTOPEERFAILED		( 1 << 11 )	/* CrossfireX cannot be enabled due to failed peer to peer test */
+#define ADL_XFIREX_STATE_MEMISDOWNGRADED		( 1 << 16 )	/* Notification that memory is currently downgraded */
+#define ADL_XFIREX_STATE_PIPESDOWNGRADED		( 1 << 17 )	/* Notification that pipes are currently downgraded */
+#define ADL_XFIREX_STATE_XFIREXACTIVE			( 1 << 18 )	/* CrossfireX is enabled on current device */
+#define ADL_XFIREX_STATE_VISMEMISDOWNGRADED		( 1 << 19 )	/* Notification that visible FB memory is currently downgraded */
+#define ADL_XFIREX_STATE_INVALIDINTERCONNECTION		( 1 << 20 )	/* Cannot support current inter-connection configuration */
+#define ADL_XFIREX_STATE_NONP2PMODE			( 1 << 21 )	/* CrossfireX will only work with clients supporting non P2P mode */
+#define ADL_XFIREX_STATE_DOWNGRADEMEMBANKS		( 1 << 22 )	/* CrossfireX cannot be enabled unless memory banks downgraded */
+#define ADL_XFIREX_STATE_MEMBANKSDOWNGRADED		( 1 << 23 )	/* Notification that memory banks are currently downgraded */
+#define ADL_XFIREX_STATE_DUALDISPLAYSALLOWED		( 1 << 24 )	/* Extended desktop or clone mode is allowed. */
+#define ADL_XFIREX_STATE_P2P_APERTURE_MAPPING		( 1 << 25 )	/* P2P mapping was through peer aperture */
+#define ADL_XFIREX_STATE_P2PFLUSH_REQUIRED		ADL_XFIREX_STATE_P2P_APERTURE_MAPPING	/* For backward compatibility */
+#define ADL_XFIREX_STATE_XSP_CONNECTED			( 1 << 26 )	/* There is CrossfireX side port connection between GPUs */
+#define ADL_XFIREX_STATE_ENABLE_CF_REBOOT_REQUIRED	( 1 << 27 )	/* System needs a reboot before enabling CrossfireX */
+#define ADL_XFIREX_STATE_DISABLE_CF_REBOOT_REQUIRED	( 1 << 28 )	/* System needs a reboot after disabling CrossfireX */
+#define ADL_XFIREX_STATE_DRV_HANDLE_DOWNGRADE_KEY	( 1 << 29 )	/* Indicate base driver handles the downgrade key updating */
+#define ADL_XFIREX_STATE_CF_RECONFIG_REQUIRED		( 1 << 30 )	/* CrossfireX needs to be reconfigured by CCC because an LDA chain is broken */
+#define ADL_XFIREX_STATE_ERRORGETTINGSTATUS		( 1 << 31 )	/* Could not obtain current status */
+// @}
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_ADJUSTMENT_PIXELFORMAT adjustment values
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+/// \defgroup define_pixel_formats Pixel Formats values
+/// This group defines the various Pixel Formats that a particular digital display can support. \n
+/// Since a display can support multiple formats, these values can be bit-or'ed to indicate the various formats \n
+// @{
+#define ADL_DISPLAY_PIXELFORMAT_UNKNOWN             0
+#define ADL_DISPLAY_PIXELFORMAT_RGB                       (1 << 0)
+#define ADL_DISPLAY_PIXELFORMAT_YCRCB444                  (1 << 1)    //Limited range
+#define ADL_DISPLAY_PIXELFORMAT_YCRCB422                 (1 << 2)    //Limited range
+#define ADL_DISPLAY_PIXELFORMAT_RGB_LIMITED_RANGE      (1 << 3)
+#define ADL_DISPLAY_PIXELFORMAT_RGB_FULL_RANGE    ADL_DISPLAY_PIXELFORMAT_RGB  //Full range
+// @}
+
+/// \defgroup define_contype Connector Type Values
+/// ADLDisplayConfig.ulConnectorType defines
+// @{
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_UNKNOWN      0
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_CV_NONI2C_JP 1
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_CV_JPN       2
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_CV_NA        3
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_CV_NONI2C_NA 4
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_VGA          5
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_DVI_D        6
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_DVI_I        7
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_HDMI_TYPE_A  8
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_HDMI_TYPE_B  9
+#define ADL_DL_DISPLAYCONFIG_CONTYPE_DISPLAYPORT  10
+// @}
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_DISPLAYINFO_ Definitions
+// for ADLDisplayInfo.iDisplayInfoMask and ADLDisplayInfo.iDisplayInfoValue
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+/// \defgroup define_displayinfomask Display Info Mask Values
+// @{
+#define ADL_DISPLAY_DISPLAYINFO_DISPLAYCONNECTED			0x00000001
+#define ADL_DISPLAY_DISPLAYINFO_DISPLAYMAPPED				0x00000002
+#define ADL_DISPLAY_DISPLAYINFO_NONLOCAL					0x00000004
+#define ADL_DISPLAY_DISPLAYINFO_FORCIBLESUPPORTED			0x00000008
+#define ADL_DISPLAY_DISPLAYINFO_GENLOCKSUPPORTED			0x00000010
+#define ADL_DISPLAY_DISPLAYINFO_MULTIVPU_SUPPORTED			0x00000020
+#define ADL_DISPLAY_DISPLAYINFO_LDA_DISPLAY					0x00000040
+#define ADL_DISPLAY_DISPLAYINFO_MODETIMING_OVERRIDESSUPPORTED			0x00000080
+
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_SINGLE			0x00000100
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_CLONE			0x00000200
+
+/// Legacy support for XP
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_2VSTRETCH		0x00000400
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_2HSTRETCH		0x00000800
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_EXTENDED		0x00001000
+
+/// More support manners
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_NSTRETCH1GPU	0x00010000
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_NSTRETCHNGPU	0x00020000
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_RESERVED2		0x00040000
+#define ADL_DISPLAY_DISPLAYINFO_MANNER_SUPPORTED_RESERVED3		0x00080000
+
+/// Projector display type
+#define ADL_DISPLAY_DISPLAYINFO_SHOWTYPE_PROJECTOR				0x00100000
+
+// @}
+
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_ADAPTER_DISPLAY_MANNER_SUPPORTED_ Definitions
+// for ADLAdapterDisplayCap of ADL_Adapter_Display_Cap()
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+/// \defgroup define_adaptermanner Adapter Manner Support Values
+// @{
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_NOTACTIVE		0x00000001
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_SINGLE			0x00000002
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_CLONE			0x00000004
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_NSTRETCH1GPU	0x00000008
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_NSTRETCHNGPU	0x00000010
+
+/// Legacy support for XP
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_2VSTRETCH		0x00000020
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_2HSTRETCH		0x00000040
+#define ADL_ADAPTER_DISPLAYCAP_MANNER_SUPPORTED_EXTENDED		0x00000080
+
+#define ADL_ADAPTER_DISPLAYCAP_PREFERDISPLAY_SUPPORTED			0x00000100
+#define ADL_ADAPTER_DISPLAYCAP_BEZEL_SUPPORTED					0x00000200
+
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_DISPLAYMAP_MANNER_ Definitions
+// for ADLDisplayMap.iDisplayMapMask and ADLDisplayMap.iDisplayMapValue
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_RESERVED			0x00000001
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_NOTACTIVE			0x00000002
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_SINGLE			0x00000004
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_CLONE				0x00000008
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_RESERVED1			0x00000010  // Removed NSTRETCH
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_HSTRETCH			0x00000020
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_VSTRETCH			0x00000040
+#define ADL_DISPLAY_DISPLAYMAP_MANNER_VLD				0x00000080
+
+// @}
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_DISPLAYMAP_OPTION_ Definitions
+// for iOption in function ADL_Display_DisplayMapConfig_Get
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+#define ADL_DISPLAY_DISPLAYMAP_OPTION_GPUINFO			0x00000001
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_DISPLAYTARGET_ Definitions
+// for ADLDisplayTarget.iDisplayTargetMask and ADLDisplayTarget.iDisplayTargetValue
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+#define ADL_DISPLAY_DISPLAYTARGET_PREFERRED			0x00000001
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_POSSIBLEMAPRESULT_VALID Definitions
+// for ADLPossibleMapResult.iPossibleMapResultMask and ADLPossibleMapResult.iPossibleMapResultValue
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+#define ADL_DISPLAY_POSSIBLEMAPRESULT_VALID				0x00000001
+#define ADL_DISPLAY_POSSIBLEMAPRESULT_BEZELSUPPORTED	0x00000002
+#define ADL_DISPLAY_POSSIBLEMAPRESULT_OVERLAPSUPPORTED	0x00000004
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_DISPLAY_MODE_ Definitions
+// for ADLMode.iModeMask, ADLMode.iModeValue, and ADLMode.iModeFlag
+// (bit-vector)
+///////////////////////////////////////////////////////////////////////////
+/// \defgroup define_displaymode Display Mode Values
+// @{
+#define ADL_DISPLAY_MODE_COLOURFORMAT_565				0x00000001
+#define ADL_DISPLAY_MODE_COLOURFORMAT_8888				0x00000002
+#define ADL_DISPLAY_MODE_ORIENTATION_SUPPORTED_000		0x00000004
+#define ADL_DISPLAY_MODE_ORIENTATION_SUPPORTED_090		0x00000008
+#define ADL_DISPLAY_MODE_ORIENTATION_SUPPORTED_180		0x00000010
+#define ADL_DISPLAY_MODE_ORIENTATION_SUPPORTED_270		0x00000020
+#define ADL_DISPLAY_MODE_REFRESHRATE_ROUNDED			0x00000040
+#define ADL_DISPLAY_MODE_REFRESHRATE_ONLY				0x00000080
+
+#define ADL_DISPLAY_MODE_PROGRESSIVE_FLAG	0
+#define ADL_DISPLAY_MODE_INTERLACED_FLAG	2
+// @}
+
+///////////////////////////////////////////////////////////////////////////
+// ADL_OSMODEINFO Definitions
+///////////////////////////////////////////////////////////////////////////
+/// \defgroup define_osmode OS Mode Values
+// @{
+#define ADL_OSMODEINFOXPOS_DEFAULT				-640
+#define ADL_OSMODEINFOYPOS_DEFAULT				0
+#define ADL_OSMODEINFOXRES_DEFAULT				640
+#define ADL_OSMODEINFOYRES_DEFAULT				480
+#define ADL_OSMODEINFOXRES_DEFAULT800			800
+#define ADL_OSMODEINFOYRES_DEFAULT600			600
+#define ADL_OSMODEINFOREFRESHRATE_DEFAULT		60
+#define ADL_OSMODEINFOCOLOURDEPTH_DEFAULT		8
+#define ADL_OSMODEINFOCOLOURDEPTH_DEFAULT16		16
+#define ADL_OSMODEINFOCOLOURDEPTH_DEFAULT24		24
+#define ADL_OSMODEINFOCOLOURDEPTH_DEFAULT32		32
+#define ADL_OSMODEINFOORIENTATION_DEFAULT		0
+#define ADL_OSMODEINFOORIENTATION_DEFAULT_WIN7	DISPLAYCONFIG_ROTATION_FORCE_UINT32
+#define ADL_OSMODEFLAG_DEFAULT					0
+// @}
+
+
+///////////////////////////////////////////////////////////////////////////
+// ADLThreadingModel Enumeration
+///////////////////////////////////////////////////////////////////////////
+/// \defgroup thread_model
+/// Used with \ref ADL_Main_ControlX2_Create and \ref ADL2_Main_ControlX2_Create to specify how ADL handles API calls when executed by multiple threads concurrently.
+/// \brief Declares ADL threading behavior.
+// @{
+typedef enum ADLThreadingModel
+{
+    ADL_THREADING_UNLOCKED    = 0, /*!< Default behavior. ADL will not enforce serialization of ADL API executions by multiple threads.  Multiple threads will be allowed to enter ADL at the same time. Note that the ADL library is not guaranteed to be thread-safe. A client that calls ADL_Main_Control_Create has to provide its own mechanism for serializing ADL calls. */
+    ADL_THREADING_LOCKED     /*!< (Implicit value 1.) ADL will enforce serialization of ADL API calls when called by multiple threads.  Only a single thread will be allowed to enter the ADL API at a time. This option makes ADL calls thread-safe. You shouldn't use this option if ADL calls will be executed on Linux on the x-server rendering thread. It can cause the application to hang.  */
+}ADLThreadingModel;
+
+// @}
+///////////////////////////////////////////////////////////////////////////
+// ADLPurposeCode Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLPurposeCode /* Enumerators are numbered consecutively from 0 (NORMAL) to 7 (ATI_ROTATION). */
+{
+    ADL_PURPOSECODE_NORMAL	= 0,
+    ADL_PURPOSECODE_HIDE_MODE_SWITCH,
+    ADL_PURPOSECODE_MODE_SWITCH,
+    ADL_PURPOSECODE_ATTATCH_DEVICE, /* "ATTATCH" is misspelled, but the identifier is kept exactly as declared. */
+    ADL_PURPOSECODE_DETACH_DEVICE,
+    ADL_PURPOSECODE_SETPRIMARY_DEVICE,
+    ADL_PURPOSECODE_GDI_ROTATION,
+    ADL_PURPOSECODE_ATI_ROTATION
+};
+///////////////////////////////////////////////////////////////////////////
+// ADLAngle Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLAngle /* Enumerator values are 0, 90, 180 and 270; presumably the rotation in degrees - confirm against the ADL SDK. */
+{
+    ADL_ANGLE_LANDSCAPE = 0,
+    ADL_ANGLE_ROTATERIGHT = 90,
+    ADL_ANGLE_ROTATE180 = 180,
+    ADL_ANGLE_ROTATELEFT = 270,
+};
+
+///////////////////////////////////////////////////////////////////////////
+// ADLOrientationDataType Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLOrientationDataType /* No explicit initializers: OSDATATYPE is 0 and NONOSDATATYPE is 1. */
+{
+    ADL_ORIENTATIONTYPE_OSDATATYPE,
+    ADL_ORIENTATIONTYPE_NONOSDATATYPE
+};
+
+///////////////////////////////////////////////////////////////////////////
+// ADLPanningMode Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLPanningMode /* Three panning modes with explicit values 0, 1 and 2. */
+{
+    ADL_PANNINGMODE_NO_PANNING = 0,
+    ADL_PANNINGMODE_AT_LEAST_ONE_NO_PANNING = 1,
+    ADL_PANNINGMODE_ALLOW_PANNING = 2,
+};
+
+///////////////////////////////////////////////////////////////////////////
+// ADLLARGEDESKTOPTYPE Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLLARGEDESKTOPTYPE /* Three desktop types with explicit values 0, 1 and 2. */
+{
+    ADL_LARGEDESKTOPTYPE_NORMALDESKTOP = 0,
+    ADL_LARGEDESKTOPTYPE_PSEUDOLARGEDESKTOP = 1,
+    ADL_LARGEDESKTOPTYPE_VERYLARGEDESKTOP = 2
+};
+
+///////////////////////////////////////////////////////////////////////////
+// ADLPlatForm Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLPlatForm /* Enumerators use the GRAPHICS_PLATFORM_ prefix rather than ADL_. */
+{
+    GRAPHICS_PLATFORM_DESKTOP  = 0,
+    GRAPHICS_PLATFORM_MOBILE   = 1
+};
+
+///////////////////////////////////////////////////////////////////////////
+// ADLGraphicCoreGeneration Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLGraphicCoreGeneration /* Explicit values: UNDEFINED is 0, PRE_GCN is 1, GCN is 2. */
+{
+    ADL_GRAPHIC_CORE_GENERATION_UNDEFINED                   = 0,
+    ADL_GRAPHIC_CORE_GENERATION_PRE_GCN                     = 1,
+    ADL_GRAPHIC_CORE_GENERATION_GCN                         = 2
+};
+
+// Other Definitions for internal use
+
+// Values for ADL_Display_WriteAndReadI2CRev_Get()
+
+#define ADL_I2C_MAJOR_API_REV           0x00000001
+#define ADL_I2C_MINOR_DEFAULT_API_REV   0x00000000
+#define ADL_I2C_MINOR_OEM_API_REV       0x00000001
+
+// Values for ADL_Display_WriteAndReadI2C()
+#define ADL_DL_I2C_LINE_OEM                0x00000001
+#define ADL_DL_I2C_LINE_OD_CONTROL         0x00000002
+#define ADL_DL_I2C_LINE_OEM2               0x00000003
+#define ADL_DL_I2C_LINE_OEM3               0x00000004
+#define ADL_DL_I2C_LINE_OEM4               0x00000005
+#define ADL_DL_I2C_LINE_OEM5               0x00000006
+#define ADL_DL_I2C_LINE_OEM6               0x00000007
+
+// Max size of I2C data buffer
+#define ADL_DL_I2C_MAXDATASIZE             0x00000040
+#define ADL_DL_I2C_MAXWRITEDATASIZE        0x0000000C
+#define ADL_DL_I2C_MAXADDRESSLENGTH        0x00000006
+#define ADL_DL_I2C_MAXOFFSETLENGTH         0x00000004
+
+
+/// Values for ADLDisplayProperty.iPropertyType
+#define ADL_DL_DISPLAYPROPERTY_TYPE_UNKNOWN              0
+#define ADL_DL_DISPLAYPROPERTY_TYPE_EXPANSIONMODE        1
+#define ADL_DL_DISPLAYPROPERTY_TYPE_USEUNDERSCANSCALING	 2
+/// Enables ITC processing for HDMI panels that are capable of the feature
+#define ADL_DL_DISPLAYPROPERTY_TYPE_ITCFLAGENABLE        9
+#define ADL_DL_DISPLAYPROPERTY_TYPE_DOWNSCALE			11
+
+
+/// Values for ADLDisplayContent.iContentType
+/// Certain HDMI panels that support ITC have support for a feature such that, the display on the panel
+/// can be adjusted to optimize the view of the content being displayed, depending on the type of content.
+#define ADL_DL_DISPLAYCONTENT_TYPE_GRAPHICS		1
+#define ADL_DL_DISPLAYCONTENT_TYPE_PHOTO		2
+#define ADL_DL_DISPLAYCONTENT_TYPE_CINEMA		4
+#define ADL_DL_DISPLAYCONTENT_TYPE_GAME			8
+
+
+
+//values for ADLDisplayProperty.iExpansionMode
+#define ADL_DL_DISPLAYPROPERTY_EXPANSIONMODE_CENTER        0
+#define ADL_DL_DISPLAYPROPERTY_EXPANSIONMODE_FULLSCREEN    1
+#define ADL_DL_DISPLAYPROPERTY_EXPANSIONMODE_ASPECTRATIO   2
+
+
+///\defgroup define_dither_states Dithering options
+// @{
+/// Dithering disabled.
+#define ADL_DL_DISPLAY_DITHER_DISABLED              0
+/// Use default driver settings for dithering. Note that the default setting could be dithering disabled.
+#define ADL_DL_DISPLAY_DITHER_DRIVER_DEFAULT        1
+/// Temporal dithering to 6 bpc. Note that if the input is 12 bits, the two least significant bits will be truncated.
+#define ADL_DL_DISPLAY_DITHER_FM6                   2
+/// Temporal dithering to 8 bpc.
+#define ADL_DL_DISPLAY_DITHER_FM8                   3
+/// Temporal dithering to 10 bpc.
+#define ADL_DL_DISPLAY_DITHER_FM10                  4
+/// Spatial dithering to 6 bpc. Note that if the input is 12 bits, the two least significant bits will be truncated.
+#define ADL_DL_DISPLAY_DITHER_DITH6                 5
+/// Spatial dithering to 8 bpc.
+#define ADL_DL_DISPLAY_DITHER_DITH8                 6
+/// Spatial dithering to 10 bpc.
+#define ADL_DL_DISPLAY_DITHER_DITH10                7
+/// Spatial dithering to 6 bpc. Random number generators are reset every frame, so the same input value of a certain pixel will always be dithered to the same output value. Note that if the input is 12 bits, the two least significant bits will be truncated.
+#define ADL_DL_DISPLAY_DITHER_DITH6_NO_FRAME_RAND   8
+/// Spatial dithering to 8 bpc. Random number generators are reset every frame, so the same input value of a certain pixel will always be dithered to the same output value.
+#define ADL_DL_DISPLAY_DITHER_DITH8_NO_FRAME_RAND   9
+/// Spatial dithering to 10 bpc. Random number generators are reset every frame, so the same input value of a certain pixel will always be dithered to the same output value.
+#define ADL_DL_DISPLAY_DITHER_DITH10_NO_FRAME_RAND  10
+/// Truncation to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN6                 11
+/// Truncation to 8 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN8                 12
+/// Truncation to 10 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN10                13
+/// Truncation to 10 bpc followed by spatial dithering to 8 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN10_DITH8          14
+/// Truncation to 10 bpc followed by spatial dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN10_DITH6          15
+/// Truncation to 10 bpc followed by temporal dithering to 8 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN10_FM8            16
+/// Truncation to 10 bpc followed by temporal dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN10_FM6            17
+/// Truncation to 10 bpc followed by spatial dithering to 8 bpc and temporal dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN10_DITH8_FM6      18
+/// Spatial dithering to 10 bpc followed by temporal dithering to 8 bpc.
+#define ADL_DL_DISPLAY_DITHER_DITH10_FM8            19
+/// Spatial dithering to 10 bpc followed by temporal dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_DITH10_FM6            20
+/// Truncation to 8 bpc followed by spatial dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN8_DITH6           21
+/// Truncation to 8 bpc followed by temporal dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_TRUN8_FM6             22
+/// Spatial dithering to 8 bpc followed by temporal dithering to 6 bpc.
+#define ADL_DL_DISPLAY_DITHER_DITH8_FM6             23
+#define ADL_DL_DISPLAY_DITHER_LAST                  ADL_DL_DISPLAY_DITHER_DITH8_FM6
+// @}
+
+
+/// Display Get Cached EDID flag
+#define ADL_MAX_EDIDDATA_SIZE              256 // number of UCHAR
+#define ADL_MAX_OVERRIDEEDID_SIZE          512 // number of UCHAR
+#define ADL_MAX_EDID_EXTENSION_BLOCKS      3
+
+#define ADL_DL_CONTROLLER_OVERLAY_ALPHA         0
+#define ADL_DL_CONTROLLER_OVERLAY_ALPHAPERPIX   1
+
+#define ADL_DL_DISPLAY_DATA_PACKET__INFO_PACKET_RESET      0x00000000
+#define ADL_DL_DISPLAY_DATA_PACKET__INFO_PACKET_SET        0x00000001
+#define ADL_DL_DISPLAY_DATA_PACKET__INFO_PACKET_SCAN       0x00000002
+
+///\defgroup define_display_packet Display Data Packet Types
+// @{
+#define ADL_DL_DISPLAY_DATA_PACKET__TYPE__AVI              0x00000001
+#define ADL_DL_DISPLAY_DATA_PACKET__TYPE__GAMMUT           0x00000002
+#define ADL_DL_DISPLAY_DATA_PACKET__TYPE__VENDORINFO       0x00000004
+#define ADL_DL_DISPLAY_DATA_PACKET__TYPE__HDR              0x00000008
+#define ADL_DL_DISPLAY_DATA_PACKET__TYPE__SPD              0x00000010
+// @}
+
+// matrix types
+#define ADL_GAMUT_MATRIX_SD         1   // SD matrix i.e. BT601
+#define ADL_GAMUT_MATRIX_HD         2   // HD matrix i.e. BT709
+
+///\defgroup define_clockinfo_flags Clock flags
+/// Used by ADLAdapterODClockInfo.iFlag
+// @{
+#define ADL_DL_CLOCKINFO_FLAG_FULLSCREEN3DONLY         0x00000001
+#define ADL_DL_CLOCKINFO_FLAG_ALWAYSFULLSCREEN3D       0x00000002
+#define ADL_DL_CLOCKINFO_FLAG_VPURECOVERYREDUCED       0x00000004
+#define ADL_DL_CLOCKINFO_FLAG_THERMALPROTECTION        0x00000008
+// @}
+
+// Supported GPUs
+// ADL_Display_PowerXpressActiveGPU_Get()
+#define ADL_DL_POWERXPRESS_GPU_INTEGRATED		1
+#define ADL_DL_POWERXPRESS_GPU_DISCRETE			2
+
+// Possible values for lpOperationResult
+// ADL_Display_PowerXpressActiveGPU_Get()
+#define ADL_DL_POWERXPRESS_SWITCH_RESULT_STARTED         1 // Switch procedure has been started - Windows platform only
+#define ADL_DL_POWERXPRESS_SWITCH_RESULT_DECLINED        2 // Switch procedure cannot be started - All platforms
+#define ADL_DL_POWERXPRESS_SWITCH_RESULT_ALREADY         3 // System already has required status  - All platforms
+#define ADL_DL_POWERXPRESS_SWITCH_RESULT_DEFERRED        5  // Switch was deferred and requires an X restart - Linux platform only
+
+// PowerXpress support version
+// ADL_Display_PowerXpressVersion_Get()
+#define ADL_DL_POWERXPRESS_VERSION_MAJOR			2	// Current PowerXpress support version 2.0
+#define ADL_DL_POWERXPRESS_VERSION_MINOR			0
+
+#define ADL_DL_POWERXPRESS_VERSION	(((ADL_DL_POWERXPRESS_VERSION_MAJOR) << 16) | ADL_DL_POWERXPRESS_VERSION_MINOR)
+
+//values for ADLThermalControllerInfo.iThermalControllerDomain
+#define ADL_DL_THERMAL_DOMAIN_OTHER      0
+#define ADL_DL_THERMAL_DOMAIN_GPU        1
+
+//values for ADLThermalControllerInfo.iFlags
+#define ADL_DL_THERMAL_FLAG_INTERRUPT    1
+#define ADL_DL_THERMAL_FLAG_FANCONTROL   2
+
+///\defgroup define_fanctrl Fan speed control
+/// Values for ADLFanSpeedInfo.iFlags
+// @{
+#define ADL_DL_FANCTRL_SUPPORTS_PERCENT_READ     1
+#define ADL_DL_FANCTRL_SUPPORTS_PERCENT_WRITE    2
+#define ADL_DL_FANCTRL_SUPPORTS_RPM_READ         4
+#define ADL_DL_FANCTRL_SUPPORTS_RPM_WRITE        8
+// @}
+
+//values for ADLFanSpeedValue.iSpeedType
+#define ADL_DL_FANCTRL_SPEED_TYPE_PERCENT    1
+#define ADL_DL_FANCTRL_SPEED_TYPE_RPM        2
+
+//values for ADLFanSpeedValue.iFlags
+#define ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED   1
+
+// MVPU interfaces
+#define ADL_DL_MAX_MVPU_ADAPTERS   4
+#define MVPU_ADAPTER_0	      0x00000001
+#define MVPU_ADAPTER_1		  0x00000002
+#define MVPU_ADAPTER_2		  0x00000004
+#define MVPU_ADAPTER_3		  0x00000008
+#define ADL_DL_MAX_REGISTRY_PATH   256
+
+//values for ADLMVPUStatus.iStatus
+#define ADL_DL_MVPU_STATUS_OFF   0
+#define ADL_DL_MVPU_STATUS_ON    1
+
+// values for ASIC family
+///\defgroup define_Asic_type Detailed asic types
+/// Defines for Adapter ASIC family type
+// @{
+#define ADL_ASIC_UNDEFINED	0
+#define ADL_ASIC_DISCRETE	(1 << 0)
+#define ADL_ASIC_INTEGRATED	(1 << 1)
+#define ADL_ASIC_FIREGL		(1 << 2)
+#define ADL_ASIC_FIREMV		(1 << 3)
+#define ADL_ASIC_XGP		(1 << 4)
+#define ADL_ASIC_FUSION		(1 << 5)
+#define ADL_ASIC_FIRESTREAM (1 << 6)
+#define ADL_ASIC_EMBEDDED   (1 << 7)
+// @}
+
+///\defgroup define_detailed_timing_flags Detailed Timing Flags
+/// Defines for ADLDetailedTiming.sTimingFlags field
+// @{
+#define ADL_DL_TIMINGFLAG_DOUBLE_SCAN              0x0001
+// Set in sTimingFlags when the mode is INTERLACED; when clear, the mode is PROGRESSIVE
+#define ADL_DL_TIMINGFLAG_INTERLACED               0x0002
+// Set in sTimingFlags when the Horizontal Sync is POSITIVE; when clear, it is NEGATIVE
+#define ADL_DL_TIMINGFLAG_H_SYNC_POLARITY          0x0004
+// Set in sTimingFlags when the Vertical Sync is POSITIVE; when clear, it is NEGATIVE
+#define ADL_DL_TIMINGFLAG_V_SYNC_POLARITY          0x0008
+// @}
+
+///\defgroup define_modetiming_standard Timing Standards
+/// Defines for ADLDisplayModeInfo.iTimingStandard field
+// @{
+#define ADL_DL_MODETIMING_STANDARD_CVT             0x00000001 // CVT Standard
+#define ADL_DL_MODETIMING_STANDARD_GTF             0x00000002 // GTF Standard
+#define ADL_DL_MODETIMING_STANDARD_DMT             0x00000004 // DMT Standard
+#define ADL_DL_MODETIMING_STANDARD_CUSTOM          0x00000008 // User-defined standard
+#define ADL_DL_MODETIMING_STANDARD_DRIVER_DEFAULT  0x00000010 // Remove Mode from overridden list
+#define ADL_DL_MODETIMING_STANDARD_CVT_RB		   0x00000020 // CVT-RB Standard
+// @}
+
+///\defgroup define_xserverinfo driver x-server info
+/// These flags are used by ADL_XServerInfo_Get()
+// @{
+
+/// Xinerama is active in the x-server. The Xinerama extension may report it to be active
+/// even though it is not active in the x-server
+#define ADL_XSERVERINFO_XINERAMAACTIVE            (1<<0)
+
+/// RandR 1.2 is supported by the driver. The RandR extension may report version 1.2
+/// even though the driver does not support it
+#define ADL_XSERVERINFO_RANDR12SUPPORTED          (1<<1)
+// @}
+
+
+///\defgroup define_eyefinity_constants Eyefinity Definitions
+// @{
+
+#define ADL_CONTROLLERVECTOR_0		1	// ADL_CONTROLLERINDEX_0 = 0, (1 << ADL_CONTROLLERINDEX_0)
+#define ADL_CONTROLLERVECTOR_1		2	// ADL_CONTROLLERINDEX_1 = 1, (1 << ADL_CONTROLLERINDEX_1)
+
+#define ADL_DISPLAY_SLSGRID_ORIENTATION_000		0x00000001
+#define ADL_DISPLAY_SLSGRID_ORIENTATION_090		0x00000002
+#define ADL_DISPLAY_SLSGRID_ORIENTATION_180		0x00000004
+#define ADL_DISPLAY_SLSGRID_ORIENTATION_270		0x00000008
+#define ADL_DISPLAY_SLSGRID_CAP_OPTION_RELATIVETO_LANDSCAPE 	0x00000001
+#define ADL_DISPLAY_SLSGRID_CAP_OPTION_RELATIVETO_CURRENTANGLE 	0x00000002
+#define ADL_DISPLAY_SLSGRID_PORTAIT_MODE 						0x00000004
+#define ADL_DISPLAY_SLSGRID_KEEPTARGETROTATION              	0x00000080
+
+#define ADL_DISPLAY_SLSGRID_SAMEMODESLS_SUPPORT		0x00000010
+#define ADL_DISPLAY_SLSGRID_MIXMODESLS_SUPPORT		0x00000020
+#define ADL_DISPLAY_SLSGRID_DISPLAYROTATION_SUPPORT	0x00000040
+#define ADL_DISPLAY_SLSGRID_DESKTOPROTATION_SUPPORT	0x00000080
+
+
+#define ADL_DISPLAY_SLSMAP_SLSLAYOUTMODE_FIT        0x0100
+#define ADL_DISPLAY_SLSMAP_SLSLAYOUTMODE_FILL       0x0200
+#define ADL_DISPLAY_SLSMAP_SLSLAYOUTMODE_EXPAND     0x0400
+
+#define ADL_DISPLAY_SLSMAP_IS_SLS        0x1000
+#define ADL_DISPLAY_SLSMAP_IS_SLSBUILDER 0x2000
+#define ADL_DISPLAY_SLSMAP_IS_CLONEVT     0x4000
+
+#define ADL_DISPLAY_SLSMAPCONFIG_GET_OPTION_RELATIVETO_LANDSCAPE 		0x00000001
+#define ADL_DISPLAY_SLSMAPCONFIG_GET_OPTION_RELATIVETO_CURRENTANGLE 	0x00000002
+
+#define ADL_DISPLAY_SLSMAPCONFIG_CREATE_OPTION_RELATIVETO_LANDSCAPE 		0x00000001
+#define ADL_DISPLAY_SLSMAPCONFIG_CREATE_OPTION_RELATIVETO_CURRENTANGLE 	0x00000002
+
+#define ADL_DISPLAY_SLSMAPCONFIG_REARRANGE_OPTION_RELATIVETO_LANDSCAPE 	0x00000001
+#define ADL_DISPLAY_SLSMAPCONFIG_REARRANGE_OPTION_RELATIVETO_CURRENTANGLE 	0x00000002
+
+#define ADL_SLS_SAMEMODESLS_SUPPORT         0x0001
+#define ADL_SLS_MIXMODESLS_SUPPORT          0x0002
+#define ADL_SLS_DISPLAYROTATIONSLS_SUPPORT  0x0004
+#define ADL_SLS_DESKTOPROTATIONSLS_SUPPORT  0x0008
+
+#define ADL_SLS_TARGETS_INVALID     0x0001
+#define ADL_SLS_MODES_INVALID       0x0002
+#define ADL_SLS_ROTATIONS_INVALID   0x0004
+#define ADL_SLS_POSITIONS_INVALID   0x0008
+#define ADL_SLS_LAYOUTMODE_INVALID  0x0010
+
+#define ADL_DISPLAY_SLSDISPLAYOFFSET_VALID        0x0002
+
+#define ADL_DISPLAY_SLSGRID_RELATIVETO_LANDSCAPE 		0x00000010
+#define ADL_DISPLAY_SLSGRID_RELATIVETO_CURRENTANGLE 	0x00000020
+
+
+/// This bit mask indicates that the displays are currently in bezel mode.
+#define ADL_DISPLAY_SLSMAP_BEZELMODE			0x00000010
+/// This bit mask indicates that the displays from this map are arranged.
+#define ADL_DISPLAY_SLSMAP_DISPLAYARRANGED		0x00000002
+/// This bit mask indicates that this map is currently in use for the current adapter.
+#define ADL_DISPLAY_SLSMAP_CURRENTCONFIG		0x00000004
+
+/// For only active SLS map info
+#define ADL_DISPLAY_SLSMAPINDEXLIST_OPTION_ACTIVE		0x00000001
+
+/// For Bezel
+#define ADL_DISPLAY_BEZELOFFSET_STEPBYSTEPSET			0x00000004
+#define ADL_DISPLAY_BEZELOFFSET_COMMIT					0x00000008
+
+typedef enum _SLS_ImageCropType {   // NOTE: plain ordinals 1/2/3, not the ADL_DISPLAY_SLSMAP_SLSLAYOUTMODE_* bit flags (0x0100/0x0200/0x0400)
+    Fit = 1,
+    Fill = 2,
+    Expand = 3
+}SLS_ImageCropType;
+
+
+typedef enum _DceSettingsType {
+    DceSetting_HdmiLq,        // = 0 (implicit: first enumerator)
+    DceSetting_DpSettings,    // = 1
+    DceSetting_Protection     // = 2
+} DceSettingsType;
+
+typedef enum _DpLinkRate {
+    DPLinkRate_RBR,     // = 0 (implicit: first enumerator)
+    DPLinkRate_HBR,     // = 1
+    DPLinkRate_HBR2,    // = 2
+    DPLinkRate_HBR3     // = 3
+} DpLinkRate;
+
+// @}
+
+///\defgroup define_powerxpress_constants PowerXpress Definitions
+/// @{
+
+/// The bit mask identifies PX caps for ADLPXConfigCaps.iPXConfigCapMask and ADLPXConfigCaps.iPXConfigCapValue
+#define	ADL_PX_CONFIGCAPS_SPLASHSCREEN_SUPPORT		0x0001
+#define	ADL_PX_CONFIGCAPS_CF_SUPPORT				0x0002
+#define	ADL_PX_CONFIGCAPS_MUXLESS					0x0004
+#define	ADL_PX_CONFIGCAPS_PROFILE_COMPLIANT			0x0008
+#define	ADL_PX_CONFIGCAPS_NON_AMD_DRIVEN_DISPLAYS	0x0010
+#define ADL_PX_CONFIGCAPS_FIXED_SUPPORT             0x0020
+#define ADL_PX_CONFIGCAPS_DYNAMIC_SUPPORT           0x0040
+#define ADL_PX_CONFIGCAPS_HIDE_AUTO_SWITCH			0x0080
+
+/// The bit mask identifies PX schemes for ADLPXSchemeRange
+#define ADL_PX_SCHEMEMASK_FIXED						0x0001
+#define ADL_PX_SCHEMEMASK_DYNAMIC					0x0002
+
+/// PX Schemes; the non-zero values reuse the ADL_PX_SCHEMEMASK_* bit masks
+typedef enum _ADLPXScheme
+{
+    ADL_PX_SCHEME_INVALID   = 0,
+    ADL_PX_SCHEME_FIXED     = ADL_PX_SCHEMEMASK_FIXED,      // 0x0001
+    ADL_PX_SCHEME_DYNAMIC   = ADL_PX_SCHEMEMASK_DYNAMIC     // 0x0002
+}ADLPXScheme;
+
+/// Legacy definitions kept only for compatibility (same numeric values as ADLPXScheme); to be removed later
+typedef enum PXScheme
+{
+    PX_SCHEME_INVALID   = 0,
+    PX_SCHEME_FIXED     = 1,
+    PX_SCHEME_DYNAMIC   = 2
+} PXScheme;
+
+
+/// @}
+
+///\defgroup define_appprofiles For Application Profiles
+/// @{
+
+#define ADL_APP_PROFILE_FILENAME_LENGTH		64
+#define ADL_APP_PROFILE_TIMESTAMP_LENGTH	32
+#define ADL_APP_PROFILE_VERSION_LENGTH		32
+#define ADL_APP_PROFILE_PROPERTY_LENGTH		64
+
+enum ApplicationListType
+{
+    ADL_PX40_MRU,           // = 0 (implicit: first enumerator)
+    ADL_PX40_MISSED,        // = 1
+    ADL_PX40_DISCRETE,      // = 2
+    ADL_PX40_INTEGRATED,    // = 3
+	ADL_MMD_PROFILED,       // = 4
+    ADL_PX40_TOTAL          // = 5; placed last, so it equals the number of list types above
+};
+
+typedef enum _ADLProfilePropertyType
+{
+    ADL_PROFILEPROPERTY_TYPE_BINARY		= 0,
+    ADL_PROFILEPROPERTY_TYPE_BOOLEAN,       // = 1 (implicit)
+    ADL_PROFILEPROPERTY_TYPE_DWORD,         // = 2
+    ADL_PROFILEPROPERTY_TYPE_QWORD,         // = 3
+    ADL_PROFILEPROPERTY_TYPE_ENUMERATED,    // = 4
+    ADL_PROFILEPROPERTY_TYPE_STRING         // = 5
+}ADLProfilePropertyType;
+
+
+/// @}
+
+///\defgroup define_dp12 For Display Port 1.2
+/// @{
+
+/// Maximum Relative Address Link
+#define ADL_MAX_RAD_LINK_COUNT	15
+
+/// @}
+
+///\defgroup defines_gamutspace Driver Supported Gamut Space
+/// @{
+
+/// These flags describe whether the gamut is related to source or to destination, and to overlay or to graphics
+#define ADL_GAMUT_REFERENCE_SOURCE       (1 << 0)
+#define ADL_GAMUT_GAMUT_VIDEO_CONTENT    (1 << 1)
+
+/// These flags describe the source of the gamut and how to read information from struct ADLGamutData
+#define ADL_CUSTOM_WHITE_POINT           (1 << 0)
+#define ADL_CUSTOM_GAMUT                 (1 << 1)
+#define ADL_GAMUT_REMAP_ONLY             (1 << 2)
+
+/// These defines identify the predefined gamut values.
+/// The driver uses them to find the entry in the table and apply the appropriate gamut space.
+#define ADL_GAMUT_SPACE_CCIR_709     (1 << 0)
+#define ADL_GAMUT_SPACE_CCIR_601     (1 << 1)
+#define ADL_GAMUT_SPACE_ADOBE_RGB    (1 << 2)
+#define ADL_GAMUT_SPACE_CIE_RGB      (1 << 3)
+#define ADL_GAMUT_SPACE_CUSTOM       (1 << 4)
+#define ADL_GAMUT_SPACE_CCIR_2020    (1 << 5)
+#define ADL_GAMUT_SPACE_APPCTRL      (1 << 6)
+
+/// Predefined white point values are structured similarly to the gamut values.
+#define ADL_WHITE_POINT_5000K       (1 << 0)
+#define ADL_WHITE_POINT_6500K       (1 << 1)
+#define ADL_WHITE_POINT_7500K       (1 << 2)
+#define ADL_WHITE_POINT_9300K       (1 << 3)
+#define ADL_WHITE_POINT_CUSTOM      (1 << 4)
+
+/// Gamut and white point coordinates range from 0.0 to 1.0; the divider is used to find the real value.
+/// X float = X int / divider
+#define ADL_GAMUT_WHITEPOINT_DIVIDER           10000
+
+/// The gamma a0 coefficient uses the following divider:
+#define ADL_REGAMMA_COEFFICIENT_A0_DIVIDER       10000000
+/// The gamma a1, a2, a3 coefficients use the following divider:
+#define ADL_REGAMMA_COEFFICIENT_A1A2A3_DIVIDER   1000
+
+/// Describes whether the coefficients are from EDID or custom user values.
+#define ADL_EDID_REGAMMA_COEFFICIENTS          (1 << 0)
+/// Used for struct ADLRegamma. If set, use the gamma ramp; if missing, use the regamma coefficients
+#define ADL_USE_GAMMA_RAMP                     (1 << 4)
+/// Used for struct ADLRegamma. If the gamma ramp flag is used, the driver could apply degamma correction to the supplied curve, depending on this flag
+#define ADL_APPLY_DEGAMMA                      (1 << 5)
+/// Specifies that standard SRGB gamma should be applied
+#define ADL_EDID_REGAMMA_PREDEFINED_SRGB       (1 << 1)
+/// Specifies that PQ gamma curve should be applied
+#define ADL_EDID_REGAMMA_PREDEFINED_PQ         (1 << 2)
+/// Specifies that PQ gamma curve should be applied, lower max nits
+#define ADL_EDID_REGAMMA_PREDEFINED_PQ_2084_INTERIM (1 << 3)
+/// Specifies that 3.6 gamma should be applied
+#define ADL_EDID_REGAMMA_PREDEFINED_36         (1 << 6)
+/// Specifies that BT709 gamma should be applied
+#define ADL_EDID_REGAMMA_PREDEFINED_BT709      (1 << 7)
+/// Specifies that regamma should be disabled, and the application controls regamma content (of the whole screen)
+#define ADL_EDID_REGAMMA_PREDEFINED_APPCTRL    (1 << 8)
+
+/// @}
+
+/// \defgroup define_ddcinfo_pixelformats DDCInfo Pixel Formats
+/// @{
+/// defines for iPanelPixelFormat  in struct ADLDDCInfo2
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB656                       0x00000001L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB666                       0x00000002L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB888                       0x00000004L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB101010                    0x00000008L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB161616                    0x00000010L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB_RESERVED1                0x00000020L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB_RESERVED2                0x00000040L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_RGB_RESERVED3                0x00000080L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_XRGB_BIAS101010              0x00000100L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_YCBCR444_8BPCC               0x00000200L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_YCBCR444_10BPCC              0x00000400L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_YCBCR444_12BPCC              0x00000800L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_YCBCR422_8BPCC               0x00001000L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_YCBCR422_10BPCC              0x00002000L
+#define ADL_DISPLAY_DDCINFO_PIXEL_FORMAT_YCBCR422_12BPCC              0x00004000L
+/// @}
+
+/// \defgroup define_source_content_TF ADLSourceContentAttributes transfer functions (gamma)
+/// @{
+/// defines for iTransferFunction in ADLSourceContentAttributes
+#define ADL_TF_sRGB				0x0001      ///< sRGB
+#define ADL_TF_BT709			0x0002      ///< BT.709
+#define ADL_TF_PQ2084			0x0004      ///< PQ2084
+#define ADL_TF_PQ2084_INTERIM	0x0008	    ///< PQ2084-Interim
+#define ADL_TF_LINEAR_0_1		0x0010      ///< Linear 0 - 1
+#define ADL_TF_LINEAR_0_125		0x0020      ///< Linear 0 - 125
+#define ADL_TF_DOLBYVISION		0x0040      ///< DolbyVision
+/// @}
+
+/// \defgroup define_source_content_CS ADLSourceContentAttributes color spaces
+/// @{
+/// defines for iColorSpace in ADLSourceContentAttributes
+#define ADL_CS_sRGB				0x0001      ///< sRGB
+#define ADL_CS_BT601 			0x0002      ///< BT.601
+#define ADL_CS_BT709			0x0004      ///< BT.709
+#define ADL_CS_BT2020			0x0008      ///< BT.2020
+#define ADL_CS_ADOBE			0x0010      ///< Adobe RGB
+#define ADL_CS_P3				0x0020      ///< DCI-P3
+#define ADL_CS_scRGB_MS_REF		0x0040      ///< scRGB (MS Reference)
+#define ADL_CS_DISPLAY_NATIVE	0x0080      ///< Display Native
+#define ADL_CS_APP_CONTROL 		0x0100      ///< Application Controlled
+#define ADL_CS_DOLBYVISION      0x0200      ///< DolbyVision
+/// @}
+
+/// \defgroup define_HDR_support ADLDDCInfo2 HDR support options
+/// @{
+/// defines for iSupportedHDR in ADLDDCInfo2
+#define ADL_HDR_CEA861_3		0x0001      ///< HDR10/CEA861.3 HDR supported
+#define ADL_HDR_DOLBYVISION		0x0002      ///< DolbyVision HDR supported
+#define ADL_HDR_FREESYNC_HDR	0x0004      ///< FreeSync HDR supported
+/// @}
+
+/// \defgroup define_dbd_state Deep Bit Depth
+/// @{
+
+/// Defines for the ADL_Workstation_DeepBitDepth_Get and ADL_Workstation_DeepBitDepth_Set functions
+/// This value indicates that the deep bit depth state is forced off
+#define ADL_DEEPBITDEPTH_FORCEOFF 	0
+/// This value indicates that the deep bit depth state is set to auto; the driver will automatically enable the
+/// appropriate deep bit depth state depending on what the connected display supports.
+#define ADL_DEEPBITDEPTH_10BPP_AUTO 	1
+/// This value indicates that the deep bit depth state is forced on to 10 bits per pixel, regardless of whether the display
+/// supports 10 bpp.
+#define ADL_DEEPBITDEPTH_10BPP_FORCEON 	2
+
+/// Defines for ADLAdapterConfigMemory of ADL_Adapter_ConfigMemory_Get
+/// If this bit is set, it indicates that the Deep Bit Depth pixel is set on the display
+#define ADL_ADAPTER_CONFIGMEMORY_DBD			(1 << 0)
+/// If this bit is set, it indicates that the display is rotated (90, 180 or 270)
+#define ADL_ADAPTER_CONFIGMEMORY_ROTATE			(1 << 1)
+/// If this bit is set, it indicates that passive stereo is set on the display
+#define ADL_ADAPTER_CONFIGMEMORY_STEREO_PASSIVE	(1 << 2)
+/// If this bit is set, it indicates that the active stereo is set on the display
+#define ADL_ADAPTER_CONFIGMEMORY_STEREO_ACTIVE	(1 << 3)
+/// If this bit is set, it indicates that the tear free vsync is set on the display
+#define ADL_ADAPTER_CONFIGMEMORY_ENHANCEDVSYNC	(1 << 4)
+#define ADL_ADAPTER_CONFIGMEMORY_TEARFREEVSYNC	(1 << 4)	// same bit as ADL_ADAPTER_CONFIGMEMORY_ENHANCEDVSYNC
+/// @}
+
+/// \defgroup define_adl_validmemoryrequiredfields Memory Type
+/// @{
+
+///  This group defines memory types in ADLMemoryRequired struct \n
+/// Indicates that this is the visible memory
+#define ADL_MEMORYREQTYPE_VISIBLE				(1 << 0)
+/// Indicates that this is the invisible memory.
+#define ADL_MEMORYREQTYPE_INVISIBLE				(1 << 1)
+/// Indicates that this is amount of visible memory per GPU that should be reserved for all other allocations.
+#define ADL_MEMORYREQTYPE_GPURESERVEDVISIBLE	(1 << 2)
+/// @}
+
+/// \defgroup define_adapter_tear_free_status
+/// Used in ADL_Adapter_TEAR_FREE_Set and ADL_Adapter_TFD_Get functions to indicate the tear free
+/// desktop status.
+/// @{
+/// Tear free desktop is enabled.
+#define ADL_ADAPTER_TEAR_FREE_ON				1
+/// Tear free desktop can't be enabled due to a lack of graphic adapter memory.
+#define ADL_ADAPTER_TEAR_FREE_NOTENOUGHMEM		-1
+/// Tear free desktop can't be enabled due to quad buffer stereo being enabled.
+#define ADL_ADAPTER_TEAR_FREE_OFF_ERR_QUADBUFFERSTEREO	-2
+/// Tear free desktop can't be enabled due to MGPU-SLS being enabled.
+#define ADL_ADAPTER_TEAR_FREE_OFF_ERR_MGPUSLD	-3
+/// Tear free desktop is disabled.
+#define ADL_ADAPTER_TEAR_FREE_OFF				0
+/// @}
+
+/// \defgroup define_adapter_crossdisplay_platforminfo
+/// Used in ADL_Adapter_CrossDisplayPlatformInfo_Get function to indicate the Crossdisplay platform info.
+/// @{
+/// CROSSDISPLAY platform.
+#define ADL_CROSSDISPLAY_PLATFORM					(1 << 0)
+/// CROSSDISPLAY platform for Lasso station.
+#define ADL_CROSSDISPLAY_PLATFORM_LASSO				(1 << 1)
+/// CROSSDISPLAY platform for docking station.
+#define ADL_CROSSDISPLAY_PLATFORM_DOCKSTATION		(1 << 2)
+/// @}
+
+/// \defgroup define_adapter_crossdisplay_option
+/// Used in ADL_Adapter_CrossdisplayInfoX2_Set function to indicate cross display options.
+/// @{
+/// Check whether a 3D application is running. If so, do not switch and return ADL_OK_WAIT; otherwise do the switch.
+#define ADL_CROSSDISPLAY_OPTION_NONE			0
+/// Force switching without checking for running 3D applications
+#define ADL_CROSSDISPLAY_OPTION_FORCESWITCH		(1 << 0)
+/// @}
+
+/// \defgroup define_adapter_states Adapter Capabilities
+/// These define the capabilities supported by an adapter. They are used by \ref ADL_Adapter_ConfigureState_Get
+/// @{
+/// Indicates that the adapter is headless (i.e. no displays can be connected to it)
+#define ADL_ADAPTERCONFIGSTATE_HEADLESS ( 1 << 2 )
+/// Indicates that the adapter is configured to define the main rendering capabilities. For example, for adapters
+/// in a Crossfire(TM) configuration, this bit would only be set on the adapter driving the display(s).
+#define ADL_ADAPTERCONFIGSTATE_REQUISITE_RENDER ( 1 << 0 )
+/// Indicates that the adapter is configured to be used to unload some of the rendering work for a particular
+/// requisite rendering adapter. For example, for adapters in a Crossfire configuration, this bit would be set
+/// on all adapters that are currently not driving the display(s)
+#define ADL_ADAPTERCONFIGSTATE_ANCILLARY_RENDER ( 1 << 1 )
+/// Indicates that the scatter gather feature is enabled on the adapter
+#define ADL_ADAPTERCONFIGSTATE_SCATTERGATHER ( 1 << 4 )
+/// @}
+
+/// \defgroup define_controllermode_ulModifiers
+/// These define the detailed actions supported by set viewport. They are used by \ref ADL_Display_ViewPort_Set
+/// @{
+/// Indicate that the viewport set will change the view position
+#define ADL_CONTROLLERMODE_CM_MODIFIER_VIEW_POSITION       0x00000001
+/// Indicate that the viewport set will change the view PanLock
+#define ADL_CONTROLLERMODE_CM_MODIFIER_VIEW_PANLOCK        0x00000002
+/// Indicate that the viewport set will change the view size
+#define ADL_CONTROLLERMODE_CM_MODIFIER_VIEW_SIZE           0x00000008
+/// @}
+
+/// \defgroup defines for Mirabilis
+/// These defines are used for the Mirabilis feature
+/// @{
+///
+/// Indicates the maximum number of audio sample rates
+#define ADL_MAX_AUDIO_SAMPLE_RATE_COUNT                    16
+/// @}
+
+///////////////////////////////////////////////////////////////////////////
+// ADLMultiChannelSplitStateFlag Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLMultiChannelSplitStateFlag
+{
+    ADLMultiChannelSplit_Unitialized = 0,   // (sic) misspelling is part of the public identifier; renaming would break callers
+    ADLMultiChannelSplit_Disabled    = 1,
+    ADLMultiChannelSplit_Enabled     = 2,
+    ADLMultiChannelSplit_SaveProfile = 3
+};
+
+///////////////////////////////////////////////////////////////////////////
+// ADLSampleRate Enumeration
+///////////////////////////////////////////////////////////////////////////
+enum ADLSampleRate
+{
+    ADLSampleRate_32KHz =0,
+    ADLSampleRate_44P1KHz,      // = 1 (implicit)
+    ADLSampleRate_48KHz,        // = 2
+    ADLSampleRate_88P2KHz,      // = 3
+    ADLSampleRate_96KHz,        // = 4
+    ADLSampleRate_176P4KHz,     // = 5
+    ADLSampleRate_192KHz,       // = 6
+    ADLSampleRate_384KHz,       // = 7, DP1.2
+    ADLSampleRate_768KHz,       // = 8, DP1.2
+    ADLSampleRate_Undefined     // = 9
+};
+
+/// \defgroup define_overdrive6_capabilities
+/// These define the capabilities supported by Overdrive 6. They are used by \ref ADL_Overdrive6_Capabilities_Get
+// @{
+/// Indicate that core (engine) clock can be changed.
+#define ADL_OD6_CAPABILITY_SCLK_CUSTOMIZATION               0x00000001
+/// Indicate that memory clock can be changed.
+#define ADL_OD6_CAPABILITY_MCLK_CUSTOMIZATION               0x00000002
+/// Indicate that graphics activity reporting is supported.
+#define ADL_OD6_CAPABILITY_GPU_ACTIVITY_MONITOR             0x00000004
+/// Indicate that power limit can be customized.
+#define ADL_OD6_CAPABILITY_POWER_CONTROL                    0x00000008
+/// Indicate that SVI2 Voltage Control is supported.
+#define ADL_OD6_CAPABILITY_VOLTAGE_CONTROL                  0x00000010
+/// Indicate that OD6+ percentage adjustment is supported.
+#define ADL_OD6_CAPABILITY_PERCENT_ADJUSTMENT               0x00000020
+/// Indicate that Thermal Limit Unlock is supported.
+#define ADL_OD6_CAPABILITY_THERMAL_LIMIT_UNLOCK             0x00000040
+///Indicate that Fan speed needs to be displayed in RPM
+#define ADL_OD6_CAPABILITY_FANSPEED_IN_RPM					0x00000080
+// @}
+
+/// \defgroup define_overdrive6_supported_states
+/// These define the power states supported by Overdrive 6. They are used by \ref ADL_Overdrive6_Capabilities_Get
+// @{
+/// Indicate that overdrive is supported in the performance state.  This is currently the only state supported.
+#define ADL_OD6_SUPPORTEDSTATE_PERFORMANCE                  0x00000001
+/// Do not use.  Reserved for future use.
+#define ADL_OD6_SUPPORTEDSTATE_POWER_SAVING                 0x00000002
+// @}
+
+/// \defgroup define_overdrive6_getstateinfo
+/// These define the power states to get information about. They are used by \ref ADL_Overdrive6_StateInfo_Get
+// @{
+/// Get default clocks for the performance state.
+#define ADL_OD6_GETSTATEINFO_DEFAULT_PERFORMANCE            0x00000001
+/// Do not use.  Reserved for future use.
+#define ADL_OD6_GETSTATEINFO_DEFAULT_POWER_SAVING           0x00000002
+/// Get clocks for current state.  Currently this is the same as \ref ADL_OD6_GETSTATEINFO_CUSTOM_PERFORMANCE
+/// since only performance state is supported.
+#define ADL_OD6_GETSTATEINFO_CURRENT                        0x00000003
+/// Get the modified clocks (if any) for the performance state.  If clocks were not modified
+/// through Overdrive 6, then this will return the same clocks as \ref ADL_OD6_GETSTATEINFO_DEFAULT_PERFORMANCE.
+#define ADL_OD6_GETSTATEINFO_CUSTOM_PERFORMANCE             0x00000004
+/// Do not use.  Reserved for future use.
+#define ADL_OD6_GETSTATEINFO_CUSTOM_POWER_SAVING            0x00000005
+// @}
+
+/// \defgroup define_overdrive6_getstate and define_overdrive6_getmaxclockadjust
+/// These define the power states to get information about. They are used by \ref ADL_Overdrive6_StateEx_Get and \ref ADL_Overdrive6_MaxClockAdjust_Get
+// @{
+/// Get default clocks for the performance state.  Only performance state is currently supported.
+#define ADL_OD6_STATE_PERFORMANCE            0x00000001
+// @}
+
+/// \defgroup define_overdrive6_setstate
+/// These define which power state to set customized clocks on. It is used by \ref ADL_Overdrive6_State_Set
+// @{
+/// Set customized clocks for the performance state.
+#define ADL_OD6_SETSTATE_PERFORMANCE                        0x00000001
+/// Do not use.  Reserved for future use.
+#define ADL_OD6_SETSTATE_POWER_SAVING                       0x00000002
+// @}
+
+/// \defgroup define_overdrive6_thermalcontroller_caps
+/// These define the capabilities of the GPU thermal controller. They are used by \ref ADL_Overdrive6_ThermalController_Caps
+// @{
+/// GPU thermal controller is supported.
+#define ADL_OD6_TCCAPS_THERMAL_CONTROLLER                   0x00000001
+/// GPU fan speed control is supported.
+#define ADL_OD6_TCCAPS_FANSPEED_CONTROL                     0x00000002
+/// Fan speed percentage can be read.
+#define ADL_OD6_TCCAPS_FANSPEED_PERCENT_READ                0x00000100
+/// Fan speed can be set by specifying a percentage value.
+#define ADL_OD6_TCCAPS_FANSPEED_PERCENT_WRITE               0x00000200
+/// Fan speed RPM (revolutions-per-minute) can be read.
+#define ADL_OD6_TCCAPS_FANSPEED_RPM_READ                    0x00000400
+/// Fan speed can be set by specifying an RPM value.
+#define ADL_OD6_TCCAPS_FANSPEED_RPM_WRITE                   0x00000800
+// @}
+
+/// \defgroup define_overdrive6_fanspeed_type
+/// These define the fan speed type being reported. They are used by \ref ADL_Overdrive6_FanSpeed_Get
+// @{
+/// Fan speed reported in percentage.
+#define ADL_OD6_FANSPEED_TYPE_PERCENT                       0x00000001
+/// Fan speed reported in RPM.
+#define ADL_OD6_FANSPEED_TYPE_RPM                           0x00000002
+/// Fan speed has been customized by the user, and fan is not running in automatic mode.
+#define ADL_OD6_FANSPEED_USER_DEFINED                       0x00000100
+// @}
+
+/// \defgroup define_overdrive_EventCounter_type
+/// These define the EventCounter type being reported. They are used by \ref ADL2_OverdriveN_CountOfEvents_Get and can also be used on ASICs that support older OD versions.
+// @{
+#define ADL_ODN_EVENTCOUNTER_THERMAL        0
+#define ADL_ODN_EVENTCOUNTER_VPURECOVERY    1
+// @}
+
+///////////////////////////////////////////////////////////////////////////
+// ADLODNControlType Enumeration (enumerators are numbered sequentially from 0)
+///////////////////////////////////////////////////////////////////////////
+enum ADLODNControlType
+{
+	ODNControlType_Current = 0,
+	ODNControlType_Default = 1,
+	ODNControlType_Auto    = 2,
+	ODNControlType_Manual  = 3
+};
+
+/// \defgroup define_ecc_mode_states
+/// These define the ECC (Error Correction Code) state. They are used by \ref ADL_Workstation_ECC_Get and \ref ADL_Workstation_ECC_Set
+// @{
+/// Error Correction is disabled.
+#define ECC_MODE_OFF 0
+/// Error Correction is enabled.
+#define ECC_MODE_ON 2
+// @}
+
+/// \defgroup define_board_layout_flags
+/// These defines are the board layout flags state which indicates what are the valid properties of \ref ADLBoardLayoutInfo . It is used by \ref ADL_Adapter_BoardLayout_Get
+// @{
+/// Indicates the number of slots is valid.
+#define ADL_BLAYOUT_VALID_NUMBER_OF_SLOTS 0x1
+/// Indicates the slot sizes are valid. Size of the slot consists of the length and width.
+#define ADL_BLAYOUT_VALID_SLOT_SIZES 0x2
+/// Indicates the connector offsets are valid.
+#define ADL_BLAYOUT_VALID_CONNECTOR_OFFSETS 0x4
+/// Indicates the connector lengths are valid.
+#define ADL_BLAYOUT_VALID_CONNECTOR_LENGTHS 0x8
+// @}
+
+/// \defgroup define_max_constants
+/// These defines are the maximum value constants.
+// @{
+/// Indicates the Maximum supported slots on board.
+#define ADL_ADAPTER_MAX_SLOTS 4
+/// Indicates the Maximum supported connectors on slot.
+#define ADL_ADAPTER_MAX_CONNECTORS 10
+/// Indicates the Maximum supported properties of connection
+#define ADL_MAX_CONNECTION_TYPES 32
+/// Indicates the Maximum relative address link count.
+#define ADL_MAX_RELATIVE_ADDRESS_LINK_COUNT 15
+/// Indicates the Maximum size of an EDID data block
+#define ADL_MAX_DISPLAY_EDID_DATA_SIZE 1024
+/// Indicates the Maximum count of Error Records.
+#define ADL_MAX_ERROR_RECORDS_COUNT  256
+/// Indicates the maximum number of power states supported
+#define ADL_MAX_POWER_POLICY    6
+// @}
+
+/// \defgroup define_connection_types
+/// These defines are the connection types constants which indicates  what are the valid connection type of given connector. It is used by \ref ADL_Adapter_SupportedConnections_Get
+// @{
+/// Indicates the VGA connection type is valid.
+#define ADL_CONNECTION_TYPE_VGA 0
+/// Indicates the DVI_I connection type is valid.
+#define ADL_CONNECTION_TYPE_DVI 1
+/// Indicates the DVI_SL connection type is valid.
+#define ADL_CONNECTION_TYPE_DVI_SL 2
+/// Indicates the HDMI connection type is valid.
+#define ADL_CONNECTION_TYPE_HDMI 3
+/// Indicates the DISPLAY PORT connection type is valid.
+#define ADL_CONNECTION_TYPE_DISPLAY_PORT 4
+/// Indicates the Active dongle DP->DVI(single link) connection type is valid.
+#define ADL_CONNECTION_TYPE_ACTIVE_DONGLE_DP_DVI_SL 5
+/// Indicates the Active dongle DP->DVI(double link) connection type is valid.
+#define ADL_CONNECTION_TYPE_ACTIVE_DONGLE_DP_DVI_DL 6
+/// Indicates the Active dongle DP->HDMI connection type is valid.
+#define ADL_CONNECTION_TYPE_ACTIVE_DONGLE_DP_HDMI 7
+/// Indicates the Active dongle DP->VGA connection type is valid.
+#define ADL_CONNECTION_TYPE_ACTIVE_DONGLE_DP_VGA 8
+/// Indicates the Passive dongle DP->HDMI connection type is valid.
+#define ADL_CONNECTION_TYPE_PASSIVE_DONGLE_DP_HDMI 9
+/// Indicates the Passive dongle DP->DVI connection type is valid.
+#define ADL_CONNECTION_TYPE_PASSIVE_DONGLE_DP_DVI 10
+/// Indicates the MST type is valid.
+#define ADL_CONNECTION_TYPE_MST 11
+/// Indicates the active dongle, all types.
+#define ADL_CONNECTION_TYPE_ACTIVE_DONGLE          12
+/// Indicates the Virtual Connection Type.
+#define ADL_CONNECTION_TYPE_VIRTUAL	13
+/// Macro for generating a single-bit bitmask from an index; the argument is parenthesized so that expression arguments (e.g. a & b) expand correctly.
+#define ADL_CONNECTION_BITMAST_FROM_INDEX(index) (1 << (index))
+// @}
+
+/// \defgroup define_connection_properties
+/// These defines are the connection properties which indicates what are the valid properties of given connection type. It is used by \ref ADL_Adapter_SupportedConnections_Get
+// @{
+/// Indicates the property Bitrate is valid.
+#define ADL_CONNECTION_PROPERTY_BITRATE 0x1
+/// Indicates the property number of lanes is valid.
+#define ADL_CONNECTION_PROPERTY_NUMBER_OF_LANES 0x2
+/// Indicates the property 3D caps is valid.
+#define ADL_CONNECTION_PROPERTY_3DCAPS  0x4
+/// Indicates the property output bandwidth is valid.
+#define ADL_CONNECTION_PROPERTY_OUTPUT_BANDWIDTH 0x8
+/// Indicates the property colordepth is valid.
+#define ADL_CONNECTION_PROPERTY_COLORDEPTH  0x10
+// @}
+
+/// \defgroup define_lanecount_constants
+/// These defines are the Lane count constants which will be used in DP & etc.
+// @{
+/// Indicates if lane count is unknown
+#define ADL_LANECOUNT_UNKNOWN 0
+/// Indicates if lane count is 1
+#define ADL_LANECOUNT_ONE 1
+/// Indicates if lane count is 2
+#define ADL_LANECOUNT_TWO 2
+/// Indicates if lane count is 4
+#define ADL_LANECOUNT_FOUR 4
+/// Indicates if lane count is 8
+#define ADL_LANECOUNT_EIGHT 8
+/// Indicates default value of lane count
+#define ADL_LANECOUNT_DEF ADL_LANECOUNT_FOUR
+// @}
+
+/// \defgroup define_linkrate_constants
+/// These defines are the link rate constants which will be used in DP & etc.
+// @{
+/// Indicates if link rate is unknown
+#define ADL_LINK_BITRATE_UNKNOWN 0
+/// Indicates if link rate is 1.62Ghz
+#define ADL_LINK_BITRATE_1_62_GHZ 0x06
+/// Indicates if link rate is 2.7Ghz
+#define ADL_LINK_BITRATE_2_7_GHZ 0x0A
+/// Indicates if link rate is 3.24Ghz
+#define ADL_LINK_BTIRATE_3_24_GHZ 0x0C
+/// Indicates if link rate is 5.4Ghz
+#define ADL_LINK_BITRATE_5_4_GHZ 0x14
+/// Indicates default value of link rate
+#define ADL_LINK_BITRATE_DEF ADL_LINK_BITRATE_2_7_GHZ
+// @}
+
+/// \defgroup define_colordepth_constants
+/// These defines are the color depth constants which will be used in DP & etc.
+// @{
+#define ADL_CONNPROP_S3D_ALTERNATE_TO_FRAME_PACK            0x00000001
+// @}
+
+
+/// \defgroup define_colordepth_constants
+/// These defines are the color depth constants which will be used in DP & etc.
+// @{
+/// Indicates if color depth is unknown
+#define ADL_COLORDEPTH_UNKNOWN 0
+/// Indicates if color depth is 666
+#define ADL_COLORDEPTH_666 1
+/// Indicates if color depth is 888
+#define ADL_COLORDEPTH_888 2
+/// Indicates if color depth is 101010
+#define ADL_COLORDEPTH_101010 3
+/// Indicates if color depth is 121212
+#define ADL_COLORDEPTH_121212 4
+/// Indicates if color depth is 141414
+#define ADL_COLORDEPTH_141414 5
+/// Indicates if color depth is 161616
+#define ADL_COLORDEPTH_161616 6
+/// Indicates default value of color depth
+#define ADL_COLOR_DEPTH_DEF ADL_COLORDEPTH_888
+// @}
+
+
+/// \defgroup define_emulation_status
+/// These defines are the status of emulation
+// @{
+/// Indicates if real device is connected.
+#define ADL_EMUL_STATUS_REAL_DEVICE_CONNECTED 0x1
+/// Indicates if emulated device is presented.
+#define ADL_EMUL_STATUS_EMULATED_DEVICE_PRESENT 0x2
+/// Indicates if emulated device is used.
+#define ADL_EMUL_STATUS_EMULATED_DEVICE_USED  0x4
+/// In case when last active real/emulated device used (when persistence is enabled but no emulation enforced then persistence will use last connected/emulated device).
+#define ADL_EMUL_STATUS_LAST_ACTIVE_DEVICE_USED 0x8
+// @}
+
+/// \defgroup define_emulation_mode
+/// These defines are the modes of emulation
+// @{
+/// Indicates if no emulation is used
+#define ADL_EMUL_MODE_OFF 0
+/// Indicates if emulation is used when display connected
+#define ADL_EMUL_MODE_ON_CONNECTED 1
+/// Indicates if emulation is used when display is disconnected
+#define ADL_EMUL_MODE_ON_DISCONNECTED 2
+/// Indicates if emulation is used always
+#define ADL_EMUL_MODE_ALWAYS 3
+// @}
+
+/// \defgroup define_emulation_query
+/// These defines select which data an emulation query returns
+// @{
+/// Indicates Data from real device
+#define ADL_QUERY_REAL_DATA 0
+/// Indicates Emulated data
+#define ADL_QUERY_EMULATED_DATA 1
+/// Indicates Data currently in use
+#define ADL_QUERY_CURRENT_DATA 2
+// @}
+
+/// \defgroup define_persistence_state
+/// These defines are the states of persistence
+// @{
+/// Indicates persistence is disabled
+#define ADL_EDID_PERSISTANCE_DISABLED 0
+/// Indicates persistence is enabled
+#define ADL_EDID_PERSISTANCE_ENABLED 1
+// @}
+
+/// \defgroup define_connector_types Connector Type
+/// defines for ADLConnectorInfo.iType
+// @{
+/// Indicates unknown Connector type
+#define ADL_CONNECTOR_TYPE_UNKNOWN                 0
+/// Indicates VGA Connector type
+#define ADL_CONNECTOR_TYPE_VGA                     1
+/// Indicates DVI-D Connector type
+#define ADL_CONNECTOR_TYPE_DVI_D                   2
+/// Indicates DVI-I Connector type
+#define ADL_CONNECTOR_TYPE_DVI_I                   3
+/// Indicates Active Dongle-NA Connector type
+#define ADL_CONNECTOR_TYPE_ATICVDONGLE_NA          4
+/// Indicates Active Dongle-JP Connector type
+#define ADL_CONNECTOR_TYPE_ATICVDONGLE_JP          5
+/// Indicates Active Dongle-NONI2C Connector type
+#define ADL_CONNECTOR_TYPE_ATICVDONGLE_NONI2C      6
+/// Indicates Active Dongle-NONI2C-D Connector type
+#define ADL_CONNECTOR_TYPE_ATICVDONGLE_NONI2C_D    7
+/// Indicates HDMI-Type A Connector type
+#define ADL_CONNECTOR_TYPE_HDMI_TYPE_A             8
+/// Indicates HDMI-Type B Connector type
+#define ADL_CONNECTOR_TYPE_HDMI_TYPE_B             9
+/// Indicates Display port Connector type
+#define ADL_CONNECTOR_TYPE_DISPLAYPORT             10
+/// Indicates EDP Connector type
+#define ADL_CONNECTOR_TYPE_EDP                     11
+/// Indicates MiniDP Connector type
+#define ADL_CONNECTOR_TYPE_MINI_DISPLAYPORT        12
+/// Indicates Virtual Connector type
+#define ADL_CONNECTOR_TYPE_VIRTUAL			       13
+// @}
+
+/// \defgroup define_freesync_usecase
+/// These defines are to specify use cases in which FreeSync should be enabled
+/// They are a bit mask. To specify FreeSync for more than one use case, the input value
+/// should be set to include multiple bits set
+// @{
+/// Indicates FreeSync is enabled for Static Screen case
+#define ADL_FREESYNC_USECASE_STATIC                 0x1
+/// Indicates FreeSync is enabled for Video use case
+#define ADL_FREESYNC_USECASE_VIDEO                  0x2
+/// Indicates FreeSync is enabled for Gaming use case
+#define ADL_FREESYNC_USECASE_GAMING                 0x4
+// @}
+
+/// \defgroup define_freesync_caps
+/// These defines are used to retrieve FreeSync display capabilities.
+/// GPU support flag also indicates whether the display is
+/// connected to a GPU that actually supports FreeSync
+// @{
+#define ADL_FREESYNC_CAP_SUPPORTED                      (1 << 0)
+#define ADL_FREESYNC_CAP_GPUSUPPORTED                   (1 << 1)
+#define ADL_FREESYNC_CAP_DISPLAYSUPPORTED               (1 << 2)
+#define ADL_FREESYNC_CAP_CURRENTMODESUPPORTED           (1 << 3)
+#define ADL_FREESYNC_CAP_NOCFXORCFXSUPPORTED            (1 << 4)
+#define ADL_FREESYNC_CAP_NOGENLOCKORGENLOCKSUPPORTED    (1 << 5)
+// @}
+
+
+/// \defgroup define_MST_CommandLine_execute
+// @{
+/// Indicates the MST command line for branch message if the bit is set. Otherwise, it is display message
+#define ADL_MST_COMMANDLINE_PATH_MSG                 0x1
+/// Indicates the MST command line to send the message as a broadcast if the bit is set
+#define ADL_MST_COMMANDLINE_BROADCAST                  0x2
+
+// @}
+
+
+/// \defgroup define_Adapter_CloneTypes_Get
+// @{
+/// Indicates there is crossGPU clone with non-AMD displays
+#define ADL_CROSSGPUDISPLAYCLONE_AMD_WITH_NONAMD                 0x1
+/// Indicates there is crossGPU clone
+#define ADL_CROSSGPUDISPLAYCLONE                  0x2
+
+// @}
+
+
+
+// End Bracket for Constants and Definitions. Add new groups ABOVE this line!
+
+// @}
+
+#endif /* ADL_DEFINES_H_ */
+
+
diff --git a/adl/adl_functions.h b/adl/adl_functions.h
new file mode 100644
index 0000000..8b0d69c
--- /dev/null
+++ b/adl/adl_functions.h
@@ -0,0 +1,292 @@
+/*******************************************************************************
+
+ * This program reads HW information from your ATI Radeon card and displays them
+ * You can also change frequencies and voltages.
+
+ * THIS PROGRAM MAY DAMAGE YOUR VIDEO CARD, IF YOU APPLY NONSENSICAL VALUES.
+ * e.g. INCREASING THE VOLTAGES AND FREQUENCIES IN CONJUNCTION WITH LOWERING THE
+ *      FAN SPEED IS NOT ADVISABLE!
+
+ * Copyright(C) Thorsten Gilling (tgilling@web.de)
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+*******************************************************************************/
+
+// ------------------------------------------------------------------------------------------------------------
+// AMD ADL function types from Version 3.0
+// ------------------------------------------------------------------------------------------------------------
+
+#if defined (__unix__)
+#	include <dlfcn.h>	//dlopen, dlsym, dlclose
+#	include <stdlib.h>
+#	include <string.h>	//memset
+#else
+#	include <windows.h>
+#	include <tchar.h>
+#endif
+
+#include "adl_sdk.h"
+
+// Definitions of the used function pointers. Add more if you use other ADL APIs
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL Main
+typedef int ( *ADL_MAIN_CONTROL_CREATE ) (ADL_MAIN_MALLOC_CALLBACK callback, int iEnumConnectedAdapters);
+typedef int ( *ADL_MAIN_CONTROL_REFRESH ) (void);	// (void): takes no arguments; an empty list in C would leave them unspecified
+typedef int ( *ADL_MAIN_CONTROL_DESTROY ) (void);	// (void): takes no arguments; an empty list in C would leave them unspecified
+typedef int ( *ADL_GRAPHICS_PLATFORM_GET ) (int *lpPlatForm);
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL Adapter/General
+typedef int ( *ADL_ADAPTER_ACTIVE_GET ) (int iAdapterIndex, int *lpStatus);
+typedef int ( *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) (int *lpNumAdapters);
+typedef int ( *ADL_ADAPTER_ADAPTERINFO_GET ) (LPAdapterInfo lpInfo, int iInputSize);
+typedef int ( *ADL_ADAPTER_ASICFAMILYTYPE_GET ) (int iAdapterIndex, int *lpAsicTypes, int *lpValids);
+typedef int ( *ADL_ADAPTER_SPEED_CAPS )	(int iAdapterIndex, int *lpCaps, int *lpValid);
+typedef int ( *ADL_ADAPTER_SPEED_GET ) (int iAdapterIndex, int *lpCurrent, int *lpDefault);
+typedef int ( *ADL_ADAPTER_SPEED_SET ) (int iAdapterIndex, int iSpeed);
+typedef int ( *ADL_ADAPTER_ACCESSIBILITY_GET ) (int iAdapterIndex, int *lpAccessibility);
+typedef int ( *ADL_ADAPTER_VIDEOBIOSINFO_GET ) (int iAdapterIndex, ADLBiosInfo *lpBiosInfo);
+typedef int ( *ADL_ADAPTER_ID_GET ) (int iAdapterIndex, int *lpAdapterID);
+
+// ADL Adapter/CrossDisplay
+typedef int ( *ADL_ADAPTER_CROSSDISPLAYADAPTERROLE_CAPS ) (int iAdapterIndex, int *lpCrossDisplaySupport, int *lpAdapterRole, int *lpNumPossDisplayAdapters, int **lppPossDisplayAdapters, int *lpNnumPosRenderingAdapters, int **lppPosRenderingAdapters, int *lpErrorStatus);
+typedef int ( *ADL_ADAPTER_CROSSDISPLAYINFO_GET ) (int iAdapterIndex, int *lpAdapterRole, int *lpCrossdisplayMode, int *lpNumDisplayAdapters, int **lppDisplayAdapters, int *lpNumRenderingAdapters, int **lppRenderingAdapters, int *lpErrorCodeStatus);
+typedef int ( *ADL_ADAPTER_CROSSDISPLAYINFO_SET ) (int iAdapterIndex, int iDisplayAdapterIndex, int iRenderingAdapterIndex, int crossdisplayMode, int *lpErrorCode);
+
+// ADL Adapter/CrossFire
+typedef int ( *ADL_ADAPTER_CROSSFIRE_CAPS ) (int iAdapterIndex, int *lpPreferred, int *lpNumComb, ADLCrossfireComb **ppCrossfireComb);
+typedef int ( *ADL_ADAPTER_CROSSFIRE_GET ) (int iAdapterIndex, ADLCrossfireComb *lpCrossfireComb, ADLCrossfireInfo *lpCrossfireInfo);
+typedef int ( *ADL_ADAPTER_CROSSFIRE_SET ) (int iAdapterIndex, ADLCrossfireComb *lpCrossfireComb);
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL Display/Misc
+
+typedef int ( *ADL_DISPLAY_DISPLAYINFO_GET ) (int iAdapterIndex, int *lpNumDisplays, ADLDisplayInfo **lppInfo, int iForceDetect);
+typedef int ( *ADL_DISPLAY_NUMBEROFDISPLAYS_GET ) (int iAdapterIndex, int *lpNumDisplays);
+typedef int ( *ADL_DISPLAY_PRESERVEDASPECTRATIO_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport, int *lpCurrent, int *lpDefault);
+typedef int ( *ADL_DISPLAY_PRESERVEDASPECTRATIO_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+typedef int ( *ADL_DISPLAY_IMAGEEXPANSION_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport, int *lpCurrent, int *lpDefault);
+typedef int ( *ADL_DISPLAY_IMAGEEXPANSION_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+typedef int ( *ADL_DISPLAY_POSITION_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpX, int *lpY, int *lpXDefault, int *lpYDefault, int *lpMinX, int *lpMinY, int *lpMaxX, int *lpMaxY, int *lpStepX, int *lpStepY);
+typedef int ( *ADL_DISPLAY_POSITION_SET ) (int iAdapterIndex, int iDisplayIndex, int iX, int iY);
+typedef int ( *ADL_DISPLAY_SIZE_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpWidth, int *lpHeight, int *lpDefaultWidth, int *lpDefaultHeight, int *lpMinWidth, int *lpMinHeight, int *lpMaxWidth, int *lpMaxHeight, int *lpStepWidth, int *lpStepHeight);
+typedef int ( *ADL_DISPLAY_SIZE_SET ) (int iAdapterIndex, int iDisplayIndex, int iWidth, int iHeight);
+typedef int ( *ADL_DISPLAY_ADJUSTCAPS_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpInfo);
+typedef int ( *ADL_DISPLAY_CAPABILITIES_GET ) (int iAdapterIndex, int *lpNumberOfControlers, int *lpNumberOfDisplays);
+typedef int ( *ADL_DISPLAY_CONNECTEDDISPLAYS_GET ) (int iAdapterIndex, int *lpConnections);
+typedef int ( *ADL_DISPLAY_DEVICECONFIG_GET ) (int iAdapterIndex, int iDisplayIndex, ADLDisplayConfig *lpDisplayConfig);
+typedef int ( *ADL_DISPLAY_PROPERTY_GET ) (int iAdapterIndex, int iDisplayIndex, ADLDisplayProperty *lpDisplayProperty);
+typedef int ( *ADL_DISPLAY_PROPERTY_SET ) (int iAdapterIndex, int iDisplayIndex, ADLDisplayProperty *lpDisplayProperty);
+typedef int ( *ADL_DISPLAY_SWITCHINGCAPABILITY_GET ) (int iAdapterIndex, int *lpResult);
+typedef int ( *ADL_DISPLAY_DITHERSTATE_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpDitherState);
+typedef int ( *ADL_DISPLAY_DITHERSTATE_SET ) (int iAdapterIndex, int iDisplayIndex, int iDitherState);
+typedef int ( *ADL_DISPLAY_SUPPORTEDPIXELFORMAT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpPixelFormat);
+typedef int ( *ADL_DISPLAY_PIXELFORMAT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpPixelFormat);
+typedef int ( *ADL_DISPLAY_PIXELFORMAT_SET ) (int iAdapterIndex, int iDisplayIndex, int iPixelFormat);
+typedef int ( *ADL_DISPLAY_ODCLOCKINFO_GET ) (int iAdapterIndex, ADLAdapterODClockInfo *lpOdClockInfo);
+typedef int ( *ADL_DISPLAY_ODCLOCKCONFIG_SET ) (int iAdapterIndex, ADLAdapterODClockConfig *lpOdClockConfig);
+typedef int ( *ADL_DISPLAY_ADJUSTMENTCOHERENT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpAdjustmentCoherentCurrent, int *lpAdjustmentCoherentDefault);
+typedef int ( *ADL_DISPLAY_ADJUSTMENTCOHERENT_SET ) (int iAdapterIndex, int iDisplayIndex, int iAdjustmentCoherent);
+typedef int ( *ADL_DISPLAY_REDUCEDBLANKING_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpReducedBlankingCurrent, int *lpReducedBlankingDefault);
+typedef int ( *ADL_DISPLAY_REDUCEDBLANKING_SET ) (int iAdapterIndex, int iDisplayIndex, int iReducedBlanking);
+typedef int ( *ADL_DISPLAY_FORMATSOVERRIDE_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSettingsSupported, int *lpSettingsSupportedEx, int *lpCurSettings);
+typedef int ( *ADL_DISPLAY_FORMATSOVERRIDE_SET ) (int iAdapterIndex, int iDisplayIndex, int iOverrideSettings);
+typedef int ( *ADL_DISPLAY_MVPUCAPS_GET ) (int iAdapterIndex, ADLMVPUCaps *lpMvpuCaps);
+typedef int ( *ADL_DISPLAY_MVPUSTATUS_GET ) (int iAdapterIndex, ADLMVPUStatus *lpMvpuStatus);
+
+// ADL Display/Eyefinity
+typedef int ( *ADL_ADAPTER_ACTIVE_SET ) (int iAdapterIndex, int iStatus, int *lpNewlyActivate);
+typedef int ( *ADL_ADAPTER_ACTIVE_SETPREFER ) (int iAdapterIndex, int iStatus, int iNumPreferTarget, ADLDisplayTarget *lpPreferTarget, int *lpNewlyActivate);
+typedef int ( *ADL_ADAPTER_PRIMARY_GET ) (int *lpPrimaryAdapterIndex);
+typedef int ( *ADL_ADAPTER_PRIMARY_SET ) (int iAdapterIndex);
+typedef int ( *ADL_ADAPTER_MODESWITCH ) (int iAdapterIndex);
+typedef int ( *ADL_DISPLAY_MODES_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpNumModes, ADLMode **lppModes);
+typedef int ( *ADL_DISPLAY_MODES_SET ) (int iAdapterIndex, int iDisplayIndex, int iNumModes, ADLMode *lpModes);
+typedef int ( *ADL_DISPLAY_POSSIBLEMODE_GET ) (int iAdapterIndex, int *lpNumModes, ADLMode **lppModes);
+typedef int ( *ADL_DISPLAY_FORCIBLEDISPLAY_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpStatus);
+typedef int ( *ADL_DISPLAY_FORCIBLEDISPLAY_SET ) (int iAdapterIndex, int iDisplayIndex, int iStatus);
+typedef int ( *ADL_ADAPTER_NUMBEROFACTIVATABLESOURCES_GET ) (int iAdapterIndex, int *lpNumSources, ADLActivatableSource **lppSources);
+typedef int ( *ADL_ADAPTER_DISPLAY_CAPS ) (int iAdapterIndex, int *lpNumDisplayCaps, ADLAdapterDisplayCap **lppAdapterDisplayCaps);
+typedef int ( *ADL_DISPLAY_DISPLAYMAPCONFIG_GET ) (int iAdapterIndex, int *lpNumDisplayMap, ADLDisplayMap **lppDisplayMap, int *lpNumDisplayTarget, ADLDisplayTarget **lppDisplayTarget, int iOptions);
+typedef int ( *ADL_DISPLAY_DISPLAYMAPCONFIG_SET ) (int iAdapterIndex, int iNumDisplayMap, ADLDisplayMap *lpDisplayMap, int iNumDisplayTarget, ADLDisplayTarget *lpDisplayTarget);
+typedef int ( *ADL_DISPLAY_POSSIBLEMAPPING_GET ) (int iAdapterIndex, int iNumberOfPresetMapping, ADLPossibleMapping *lpPresetMappings, int iEnquiryControllerIndex, int *lpNumberOfEnquiryPossibleMappings, ADLPossibleMapping **lppEnquiryPossibleMappings);
+typedef int ( *ADL_DISPLAY_DISPLAYMAPCONFIG_VALIDATE ) (int iAdapterIndex, int iNumPossibleMap, ADLPossibleMap *lpPossibleMaps, int *lpNumPossibleMapResult, ADLPossibleMapResult **lppPossibleMapResult);
+typedef int ( *ADL_DISPLAY_DISPLAYMAPCONFIG_POSSIBLEADDANDREMOVE ) (int iAdapterIndex, int iNumDisplayMap, ADLDisplayMap *lpDisplayMap, int iNumDisplayTarget, ADLDisplayTarget *lpDisplayTarget, int *lpNumPossibleAddTarget, ADLDisplayTarget **lppPossibleAddTarget, int *lpNumPossibleRemoveTarget, ADLDisplayTarget **lppPossibleRemoveTarget);
+typedef int ( *ADL_DISPLAY_SLSGRID_CAPS ) (int iAdapterIndex, int *lpNumSLSGrid, ADLSLSGrid **lppSLSGrid, int iOption);
+typedef int ( *ADL_DISPLAY_SLSMAPINDEXLIST_GET ) (int iAdapterIndex, int *lpNumSLSMapIndexList, int **lppSLSMapIndexList, int iOptions);
+typedef int ( *ADL_DISPLAY_SLSMAPINDEX_GET ) (int iAdapterIndex, int iADLNumDisplayTarget, ADLDisplayTarget *lpDisplayTarget, int *lpSLSMapIndex);
+typedef int ( *ADL_DISPLAY_SLSMAPCONFIG_GET ) (int iAdapterIndex, int iSLSMapIndex, ADLSLSMap *lpSLSMap, int *lpNumSLSTarget, ADLSLSTarget **lppSLSTarget, int *lpNumNativeMode, ADLSLSMode **lppNativeMode, int *lpNumBezelMode, ADLBezelTransientMode **lppBezelMode, int *lpNumTransientMode, ADLBezelTransientMode **lppTransientMode, int *lpNumSLSOffset, ADLSLSOffset **lppSLSOffset, int iOption);
+typedef int ( *ADL_DISPLAY_SLSMAPCONFIG_CREATE ) (int iAdapterIndex, ADLSLSMap SLSMap, int iNumTargetTarget, ADLSLSTarget *lpSLSTarget, int iBezelModePercent, int *lpSLSMapIndex, int iOption);
+typedef int ( *ADL_DISPLAY_SLSMAPCONFIG_DELETE ) (int iAdapterIndex, int iSLSMapIndex);
+typedef int ( *ADL_DISPLAY_SLSMAPCONFIG_SETSTATE ) (int iAdapterIndex, int iSLSMapIndex, int iState);
+typedef int ( *ADL_DISPLAY_SLSMAPCONFIG_REARRANGE ) (int iAdapterIndex, int iSLSMapIndex, int iNumDisplayTarget, ADLSLSTarget *lpSLSTarget, ADLSLSMap slsMap, int iOption);
+typedef int ( *ADL_DISPLAY_POSSIBLEMODE_WINXP_GET ) (int iAdapterIndex, int iNumDisplayTargets, ADLDisplayTarget *lpDisplayTargets, int iLargeDesktopSupportedType, int iDevicePanningControl, int *lpNumModes, ADLMode **lppModes);
+typedef int ( *ADL_DISPLAY_BEZELOFFSETSTEPPINGSIZE_GET ) (int iAdapterIndex, int *lpNumBezelOffsetSteppingSize, ADLBezelOffsetSteppingSize **lppBezelOffsetSteppingSize);
+typedef int ( *ADL_DISPLAY_BEZELOFFSET_SET ) (int iAdapterIndex, int iSLSMapIndex, int iNumBezelOffset, LPADLSLSOffset lpBezelOffset, ADLSLSMap SLSMap, int iOption);
+typedef int ( *ADL_DISPLAY_BEZELSUPPORTED_VALIDATE ) (int iAdapterIndex, int iNumPossibleSLSMap, LPADLPossibleSLSMap lpPossibleSLSMaps, int *lpNumPossibleSLSMapResult, LPADLPossibleMapResult *lppPossibleMapResult);
+
+// ADL Display/Color
+typedef int ( *ADL_DISPLAY_COLORCAPS_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpCaps, int *lpValids);
+typedef int ( *ADL_DISPLAY_COLOR_SET ) (int iAdapterIndex, int iDisplayIndex, int iColorType, int iCurrent);
+typedef int ( *ADL_DISPLAY_COLOR_GET ) (int iAdapterIndex, int iDisplayIndex, int iColorType, int *lpCurrent, int *lpDefault, int *lpMin, int *lpMax, int *lpStep);
+typedef int ( *ADL_DISPLAY_COLORTEMPERATURESOURCE_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpTempSource);
+typedef int ( *ADL_DISPLAY_COLORTEMPERATURESOURCE_SET ) (int iAdapterIndex, int iDisplayIndex, int iTempSource);
+
+// ADL Display/Timing
+typedef int ( *ADL_DISPLAY_MODETIMINGOVERRIDE_GET ) (int iAdapterIndex, int iDisplayIndex, ADLDisplayMode *lpModeIn, ADLDisplayModeInfo *lpModeInfoOut);
+typedef int ( *ADL_DISPLAY_MODETIMINGOVERRIDE_SET ) (int iAdapterIndex, int iDisplayIndex, ADLDisplayModeInfo *lpMode, int iForceUpdate);
+typedef int ( *ADL_DISPLAY_MODETIMINGOVERRIDELIST_GET ) (int iAdapterIndex, int iDisplayIndex, int iMaxNumOfOverrides, ADLDisplayModeInfo *lpModeInfoList, int *lpNumOfOverrides);
+
+// ADL Display/Customize
+typedef int ( *ADL_DISPLAY_CUSTOMIZEDMODELISTNUM_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpListNum);
+typedef int ( *ADL_DISPLAY_CUSTOMIZEDMODELIST_GET ) (int iAdapterIndex, int iDisplayIndex, ADLCustomMode *lpCustomModeList, int iBuffSize);
+typedef int ( *ADL_DISPLAY_CUSTOMIZEDMODE_ADD ) (int iAdapterIndex, int iDisplayIndex, ADLCustomMode customMode);
+typedef int ( *ADL_DISPLAY_CUSTOMIZEDMODE_DELETE ) (int iAdapterIndex, int iDisplayIndex, int iIndex);
+typedef int ( *ADL_DISPLAY_CUSTOMIZEDMODE_VALIDATE ) (int iAdapterIndex, int iDisplayIndex, ADLCustomMode customMode, int *lpValid);
+
+// ADL Display/Over-Underscan (function-pointer types for the underscan/overscan setters and getters)
+typedef int ( *ADL_DISPLAY_UNDERSCAN_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+typedef int ( *ADL_DISPLAY_UNDERSCAN_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpCurrent, int *lpDefault, int *lpMin, int *lpMax, int *lpStep);
+typedef int ( *ADL_DISPLAY_OVERSCAN_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+typedef int ( *ADL_DISPLAY_OVERSCAN_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpCurrent, int *lpDefault, int *lpMin, int *lpMax, int *lpStep);
+
+// ADL Display/Overlay
+typedef int ( *ADL_DISPLAY_CONTROLLEROVERLAYADJUSTMENTCAPS_GET ) (int iAdapterIndex, ADLControllerOverlayInput *lpOverlayInput, ADLControllerOverlayInfo *lpCapsInfo);
+typedef int ( *ADL_DISPLAY_CONTROLLEROVERLAYADJUSTMENTDATA_GET ) (int iAdapterIndex, ADLControllerOverlayInput *lpOverlay);
+typedef int ( *ADL_DISPLAY_CONTROLLEROVERLAYADJUSTMENTDATA_SET ) (int iAdapterIndex, ADLControllerOverlayInput *lpOverlay);
+
+// ADL Display/PowerXpress
+typedef int ( *ADL_DISPLAY_POWERXPRESSVERSION_GET ) (int iAdapterIndex, int *lpVersion);
+typedef int ( *ADL_DISPLAY_POWERXPRESSACTIVEGPU_GET ) (int iAdapterIndex, int *lpActiveGPU);
+typedef int ( *ADL_DISPLAY_POWERXPRESSACTIVEGPU_SET ) (int iAdapterIndex, int iActiveGPU, int *lpOperationResult);
+typedef int ( *ADL_DISPLAY_POWERXPRESS_AUTOSWITCHCONFIG_GET ) (int iAdapterIndex, int *lpAutoSwitchOnACDCEvent, int *lpAutoSwitchOnDCACEvent);
+typedef int ( *ADL_DISPLAY_POWERXPRESS_AUTOSWITCHCONFIG_SET ) (int iAdapterIndex, int iAutoSwitchOnACDCEvent, int iAutoSwitchOnDCACEvent);
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL DFP
+typedef int ( *ADL_DFP_BASEAUDIOSUPPORT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport);
+typedef int ( *ADL_DFP_HDMISUPPORT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport);
+typedef int ( *ADL_DFP_MVPUANALOGSUPPORT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport);
+typedef int ( *ADL_DFP_PIXELFORMAT_CAPS ) (int iAdapterIndex, int iDisplayIndex, int *lpValidBits, int *lpValidCaps);
+typedef int ( *ADL_DFP_PIXELFORMAT_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpCurState, int *lpDefault);
+typedef int ( *ADL_DFP_PIXELFORMAT_SET ) (int iAdapterIndex, int iDisplayIndex, int iState);
+typedef int ( *ADL_DFP_GPUSCALINGENABLE_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport, int *lpCurrent, int *lpDefault);
+typedef int ( *ADL_DFP_GPUSCALINGENABLE_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+typedef int ( *ADL_DFP_ALLOWONLYCETIMINGS_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpSupport, int *lpCurrent, int *lpDefault);
+typedef int ( *ADL_DFP_ALLOWONLYCETIMINGS_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+
+// ADL TV
+typedef int ( *ADL_DISPLAY_TVCAPS_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpcaps);
+typedef int ( *ADL_TV_STANDARD_SET ) (int iAdapterIndex, int iDisplayIndex, int iCurrent);
+typedef int ( *ADL_TV_STANDARD_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpCurrent, int *lpDefault, int *lpSupportedStandards);
+
+// ADL Component Video
+typedef int ( *ADL_CV_DONGLESETTINGS_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpDongleSetting, int *lpOverrideSettingsSupported, int *lpCurOverrideSettings);
+typedef int ( *ADL_CV_DONGLESETTINGS_SET ) (int iAdapterIndex, int iDisplayIndex, int iOverrideSettings);
+typedef int ( *ADL_CV_DONGLESETTINGS_RESET ) (int iAdapterIndex, int iDisplayIndex);
+
+// ------------------------------------------------------------------------------------------------------------
+
+typedef int ( *ADL_OVERDRIVE_CAPS ) (int iAdapterIndex, int *iSupported, int *iEnabled, int *iVersion);
+
+// ADL Overdrive 5
+typedef int ( *ADL_OVERDRIVE5_CURRENTACTIVITY_GET ) (int iAdapterIndex, ADLPMActivity *lpActivity);
+typedef int ( *ADL_OVERDRIVE5_THERMALDEVICES_ENUM ) (int iAdapterIndex, int iThermalControllerIndex, ADLThermalControllerInfo *lpThermalControllerInfo);
+typedef int ( *ADL_OVERDRIVE5_TEMPERATURE_GET ) (int iAdapterIndex, int iThermalControllerIndex, ADLTemperature *lpTemperature);
+typedef int ( *ADL_OVERDRIVE5_FANSPEEDINFO_GET ) (int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedInfo *lpFanSpeedInfo);
+typedef int ( *ADL_OVERDRIVE5_FANSPEED_GET ) (int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
+typedef int ( *ADL_OVERDRIVE5_FANSPEED_SET ) (int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
+typedef int ( *ADL_OVERDRIVE5_FANSPEEDTODEFAULT_SET ) (int iAdapterIndex, int iThermalControllerIndex);
+typedef int ( *ADL_OVERDRIVE5_ODPARAMETERS_GET ) (int iAdapterIndex, ADLODParameters *lpOdParameters);
+typedef int ( *ADL_OVERDRIVE5_ODPERFORMANCELEVELS_GET ) (int iAdapterIndex, int iDefault, ADLODPerformanceLevels *lpOdPerformanceLevels);
+typedef int ( *ADL_OVERDRIVE5_ODPERFORMANCELEVELS_SET ) (int iAdapterIndex, ADLODPerformanceLevels *lpOdPerformanceLevels);
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL Overdrive 6
+typedef int ( *ADL_OVERDRIVE6_CAPABILITIES_GET ) (int iAdapterIndex, ADLOD6Capabilities *lpODCapabilities);
+typedef int	( *ADL_OVERDRIVE6_CURRENTSTATUS_GET )(int iAdapterIndex, ADLOD6CurrentStatus *lpCurrentStatus);
+typedef int ( *ADL_OVERDRIVE6_THERMALCONTROLLER_CAPS )(int iAdapterIndex, ADLOD6ThermalControllerCaps *lpThermalControllerCaps);
+typedef int ( *ADL_OVERDRIVE6_FANSPEED_GET )(int iAdapterIndex, ADLOD6FanSpeedInfo *lpFanSpeedInfo);
+typedef int ( *ADL_OVERDRIVE6_FANSPEED_SET )(int iAdapterIndex, ADLOD6FanSpeedValue *lpFanSpeedValue);
+typedef int ( *ADL_OVERDRIVE6_TEMPERATURE_GET )(int iAdapterIndex, int *lpTemperature);
+typedef int ( *ADL_OVERDRIVE6_STATEINFO_GET )(int iAdapterIndex, int iStateType, ADLOD6StateInfo *lpStateInfo);
+typedef int ( *ADL_OVERDRIVE6_STATE_SET )(int iAdapterIndex, int iStateType, ADLOD6StateInfo *lpStateInfo);
+typedef int ( *ADL_OVERDRIVE6_POWERCONTROL_CAPS ) (int iAdapterIndex, int *lpSupported);
+typedef int ( *ADL_OVERDRIVE6_POWERCONTROLINFO_GET )(int iAdapterIndex, ADLOD6PowerControlInfo *lpPowerControlInfo);
+typedef int ( *ADL_OVERDRIVE6_POWERCONTROL_GET )(int iAdapterIndex, int *lpCurrentValue, int *lpDefaultValue);
+typedef int ( *ADL_OVERDRIVE6_POWERCONTROL_SET )(int iAdapterIndex, int iValue);
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL I2C
+typedef int ( *ADL_DISPLAY_WRITEANDREADI2CREV_GET ) (int iAdapterIndex, int *lpMajor, int *lpMinor);
+typedef int ( *ADL_DISPLAY_WRITEANDREADI2C ) (int iAdapterIndex, ADLI2C *plI2C);
+typedef int ( *ADL_DISPLAY_DDCBLOCKACCESS_GET ) (int iAdapterIndex, int iDisplayIndex, int iOption, int iCommandIndex, int iSendMsgLen, char *lpucSendMsgBuf, int *lpulRecvMsgLen, char *lpucRecvMsgBuf);
+typedef int ( *ADL_DISPLAY_DDCINFO_GET ) (int iAdapterIndex, int iDisplayIndex, ADLDDCInfo *lpInfo);
+typedef int ( *ADL_DISPLAY_EDIDDATA_GET ) (int iAdapterIndex, int iDisplayIndex, ADLDisplayEDIDData *lpEDIDData);
+
+// ------------------------------------------------------------------------------------------------------------
+
+// ADL Workstation
+typedef int ( *ADL_WORKSTATION_CAPS ) (int iAdapterIndex, int *lpValidBits, int *lpCaps);
+typedef int ( *ADL_WORKSTATION_STEREO_GET ) (int iAdapterIndex, int *lpDefState, int *lpCurState);
+typedef int ( *ADL_WORKSTATION_STEREO_SET ) (int iAdapterIndex, int iCurState);
+typedef int ( *ADL_WORKSTATION_ADAPTERNUMOFGLSYNCCONNECTORS_GET ) (int iAdapterIndex, int *lpNumOfGLSyncConnectors);
+typedef int ( *ADL_WORKSTATION_DISPLAYGENLOCKCAPABLE_GET ) (int iAdapterIndex, int iDisplayIndex, int *lpCanGenlock);
+typedef int ( *ADL_WORKSTATION_GLSYNCMODULEDETECT_GET ) (int iAdapterIndex, int iGlSyncConnector, ADLGLSyncModuleID *lpGlSyncModuleID);
+typedef int ( *ADL_WORKSTATION_GLSYNCMODULEINFO_GET ) (int iAdapterIndex, int iGlSyncConnector, int *lpNumGLSyncGPUPorts, int *lpNumGlSyncPorts, int *lpMaxSyncDelay, int *lpMaxSampleRate, ADLGLSyncPortCaps **ppGlSyncPorts);
+typedef int ( *ADL_WORKSTATION_GLSYNCGENLOCKCONFIGURATION_GET ) (int iAdapterIndex, int iGlSyncConnector, int iGlValidMask, ADLGLSyncGenlockConfig *lpGlSyncGenlockConfig);
+typedef int ( *ADL_WORKSTATION_GLSYNCGENLOCKCONFIGURATION_SET ) (int iAdapterIndex, int iGlSyncConnector, ADLGLSyncGenlockConfig glSyncGenlockConfig);
+typedef int ( *ADL_WORKSTATION_GLSYNCPORTSTATE_GET ) (int iAdapterIndex, int iGlSyncConnector, int iGlSyncPortType, int iNumLEDs, ADLGlSyncPortInfo *lpGlSyncPortInfo, int **ppGlSyncLEDs);
+typedef int ( *ADL_WORKSTATION_GLSYNCPORTSTATE_SET ) (int iAdapterIndex, int iGlSyncConnector, ADLGlSyncPortControl glSyncPortControl);
+typedef int ( *ADL_WORKSTATION_DISPLAYGLSYNCMODE_GET ) (int iAdapterIndex, int iDisplayIndex, ADLGlSyncMode *lpGlSyncMode);
+typedef int ( *ADL_WORKSTATION_DISPLAYGLSYNCMODE_SET ) (int iAdapterIndex, int iDisplayIndex, ADLGlSyncMode glSyncMode);
+typedef int ( *ADL_WORKSTATION_GLSYNCSUPPORTEDTOPOLOGY_GET ) (int iAdapterIndex, int iNumSyncModes, ADLGlSyncMode2 *glSyncModes, int *iNumSugSyncModes, ADLGlSyncMode2 **glSugSyncModes);
+typedef int ( *ADL_WORKSTATION_LOADBALANCING_GET ) (int *lpResultMask, int *lpCurResultValue, int *lpDefResultValue);
+typedef int ( *ADL_WORKSTATION_LOADBALANCING_SET ) (int iCurState);
+typedef int ( *ADL_WORKSTATION_LOADBALANCING_CAPS ) (int iAdapterIndex, int *lpResultMask, int *lpResultValue);
+
+// ------------------------------------------------------------------------------------------------------------
+
+#ifdef __linux__
+// ADL Linux
+typedef int ( *ADL_ADAPTER_MEMORYINFO_GET ) (int iAdapterIndex, ADLMemoryInfo *lpMemoryInfo);
+typedef int ( *ADL_CONTROLLER_COLOR_SET ) (int iAdapterIndex, int iControllerIndex, ADLGamma adlGamma);
+typedef int ( *ADL_CONTROLLER_COLOR_GET ) (int iAdapterIndex, int iControllerIndex, ADLGamma *lpGammaCurrent, ADLGamma *lpGammaDefault, ADLGamma *lpGammaMin, ADLGamma *lpGammaMax);
+typedef int ( *ADL_DESKTOPCONFIG_GET ) (int iAdapterIndex, int *lpDesktopConfig);
+typedef int ( *ADL_DESKTOPCONFIG_SET ) (int iAdapterIndex, int iDesktopConfig);
+typedef int ( *ADL_NUMBEROFDISPLAYENABLE_GET ) (int iAdapterIndex, int *lpNumberOfDisplays);
+typedef int ( *ADL_DISPLAYENABLE_SET ) (int iAdapterIndex, int *lpDisplayIndexList, int iDisplayListSize, int bPersistOnly);
+typedef int ( *ADL_DISPLAY_IDENTIFYDISPLAY ) (int iAdapterIndex, int iDisplayIndex, int iDisplayControllerIndex, int iShow, int iDisplayNum, int iPosX, int iPosY);
+typedef int ( *ADL_DISPLAY_LUTCOLOR_SET ) (int iAdapterIndex, int iDisplayIndex, ADLGamma adlGamma);
+typedef int ( *ADL_DISPLAY_LUTCOLOR_GET ) (int iAdapterIndex, int iDisplayIndex, ADLGamma *lpGammaCurrent, ADLGamma *lpGammaDefault, ADLGamma *lpGammaMin, ADLGamma *lpGammaMax);
+typedef int ( *ADL_ADAPTER_XSCREENINFO_GET ) (LPXScreenInfo lpXScreenInfo, int iInputSize);
+typedef int ( *ADL_DISPLAY_XRANDRDISPLAYNAME_GET ) (int iAdapterIndex, int iDisplayIndex, char *lpXrandrDisplayName, int iBuffSize);
+#endif
+// ------------------------------------------------------------------------------------------------------------
+
+
+// experimental undocumented
+typedef int ( *ADL_OVERDRIVE5_POWERCONTROL_GET ) (int iAdapterIndex, int* iPercentage, int* whatever);
+typedef int ( *ADL_OVERDRIVE5_POWERCONTROL_SET ) (int iAdapterIndex, int iPercentage);
+//typedef int ( *ADL_OVERDRIVE5_POWERCONTROL_CAPS ) (int iAdapterIndex, int* lpCaps, int* lpValid);
+//typedef int ( *ADL_OVERDRIVE5_POWERCONTROLINFO_GET) (int iAdapterIndex, ...)
diff --git a/adl/adl_sdk.h b/adl/adl_sdk.h
new file mode 100644
index 0000000..5af5a66
--- /dev/null
+++ b/adl/adl_sdk.h
@@ -0,0 +1,29 @@
+///
+///  Copyright (c) 2008 - 2013 Advanced Micro Devices, Inc.
+ 
+///  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
+///  EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
+///  WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+
+/// \file adl_sdk.h
+/// \brief Contains the definition of the Memory Allocation Callback.\n <b>Included in ADL SDK</b>
+///
+/// \n\n
+/// This file contains the definition of the Memory Allocation Callback.\n
+/// It also includes definitions of the respective structures and constants.\n
+/// <b> This is the only header file to be included in a C/C++ project using ADL </b>
+
+#ifndef ADL_SDK_H_
+#define ADL_SDK_H_
+
+#include "adl_structures.h"
+
+#if defined (LINUX)
+#define __stdcall
+#endif /* (LINUX) */
+
+/// Memory Allocation Callback
+typedef void* ( __stdcall *ADL_MAIN_MALLOC_CALLBACK )( int );
+
+
+#endif /* ADL_SDK_H_ */
diff --git a/adl/adl_structures.h b/adl/adl_structures.h
new file mode 100644
index 0000000..ae1f0a0
--- /dev/null
+++ b/adl/adl_structures.h
@@ -0,0 +1,2769 @@
+///
+///  Copyright (c) 2008 - 2013 Advanced Micro Devices, Inc.
+
+///  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
+///  EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
+///  WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+
+/// \file adl_structures.h
+///\brief This file contains the structure declarations that are used by the public ADL interfaces for \ALL platforms.\n <b>Included in ADL SDK</b>
+///
+/// All data structures used in AMD Display Library (ADL) public interfaces should be defined in this header file.
+///
+
+#ifndef ADL_STRUCTURES_H_
+#define ADL_STRUCTURES_H_
+
+#include "adl_defines.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the graphics adapter.
+///
+/// This structure is used to store various information about the graphics adapter.  This
+/// information can be returned to the user. Alternatively, it can be used to access various driver calls to set
+/// or fetch various settings upon the user's request.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct AdapterInfo
+{
+/// \ALL_STRUCT_MEM
+
+/// Size of the structure.
+    int iSize;
+/// The ADL index handle. One GPU may be associated with one or two index handles
+    int iAdapterIndex;
+/// The unique device ID associated with this adapter.
+    char strUDID[ADL_MAX_PATH];
+/// The BUS number associated with this adapter.
+    int iBusNumber;
+/// The device number associated with this adapter.
+    int iDeviceNumber;
+/// The function number.
+    int iFunctionNumber;
+/// The vendor ID associated with this adapter.
+    int iVendorID;
+/// Adapter name.
+    char strAdapterName[ADL_MAX_PATH];
+/// Display name. For example, "\\\\Display0" for Windows or ":0:0" for Linux.
+    char strDisplayName[ADL_MAX_PATH];
+/// Present or not; 1 if present and 0 if not present. If the logical adapter is present, the display name such as \\\\.\\Display1 can be found from the OS.
+	int iPresent;
+
+#if defined (_WIN32) || defined (_WIN64)
+/// \WIN_STRUCT_MEM
+
+/// Exists or not; 1 if it exists and 0 if it does not.
+    int iExist;
+/// Driver registry path.
+    char strDriverPath[ADL_MAX_PATH];
+/// Driver registry path Ext for.
+    char strDriverPathExt[ADL_MAX_PATH];
+/// PNP string from Windows.
+    char strPNPString[ADL_MAX_PATH];
+/// It is generated from EnumDisplayDevices.
+    int iOSDisplayIndex;
+#endif /* (_WIN32) || (_WIN64) */
+
+#if defined (LINUX)
+/// \LNX_STRUCT_MEM
+
+/// Internal X screen number from GPUMapInfo (DEPRECATED; use XScreenInfo)
+    int iXScreenNum;
+/// Internal driver index from GPUMapInfo
+    int iDrvIndex;
+/// \deprecated Internal x config file screen identifier name. Use XScreenInfo instead.
+    char strXScreenConfigName[ADL_MAX_PATH];
+
+#endif /* (LINUX) */
+} AdapterInfo, *LPAdapterInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the Linux X screen information.
+///
+/// This structure is used to store the current screen number and xorg.conf ID name associated with an adapter index.
+/// This structure is updated during ADL_Main_Control_Refresh or ADL_ScreenInfo_Update.
+/// Note:  This structure should be used in place of iXScreenNum and strXScreenConfigName in AdapterInfo as they will be
+/// deprecated.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+#if defined (LINUX)
+typedef struct XScreenInfo
+{
+/// Internal X screen number from GPUMapInfo.
+	int iXScreenNum;
+/// Internal x config file screen identifier name.
+    char strXScreenConfigName[ADL_MAX_PATH];
+} XScreenInfo, *LPXScreenInfo;
+#endif /* (LINUX) */
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the ASIC memory.
+///
+/// This structure is used to store various information about the ASIC memory.  This
+/// information can be returned to the user.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLMemoryInfo
+{
+/// Memory size in bytes.
+    long long iMemorySize;
+/// Memory type in string.
+    char strMemoryType[ADL_MAX_PATH];
+/// Memory bandwidth in Mbytes/s.
+    long long iMemoryBandwidth;
+} ADLMemoryInfo, *LPADLMemoryInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about memory required by type
+///
+/// This structure is returned by ADL_Adapter_ConfigMemory_Get, which given a desktop and display configuration
+/// will return the Memory used.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLMemoryRequired
+{
+	long long iMemoryReq;		/// Memory in bytes required
+	int iType;					/// Type of Memory \ref define_adl_validmemoryrequiredfields
+	int iDisplayFeatureValue;   /// Display features \ref define_adl_visiblememoryfeatures that are using this type of memory
+} ADLMemoryRequired, *LPADLMemoryRequired;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the features associated with a display
+///
+/// This structure is a parameter to ADL_Adapter_ConfigMemory_Get, which given a desktop and display configuration
+/// will return the Memory used.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLMemoryDisplayFeatures
+{
+	int iDisplayIndex;			/// ADL Display index
+	int iDisplayFeatureValue;	/// features that the display is using \ref define_adl_visiblememoryfeatures
+} ADLMemoryDisplayFeatures, *LPADLMemoryDisplayFeatures;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing DDC information.
+///
+/// This structure is used to store various DDC information that can be returned to the user.
+/// Note that all fields of type int are actually defined as unsigned int types within the driver.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDDCInfo
+{
+/// Size of the structure
+    int  ulSize;
+/// Indicates whether the attached display supports DDC. If this field is zero on return, no other DDC information fields will be used.
+    int  ulSupportsDDC;
+/// Returns the manufacturer ID of the display device. Should be zeroed if this information is not available.
+    int  ulManufacturerID;
+/// Returns the product ID of the display device. Should be zeroed if this information is not available.
+    int  ulProductID;
+/// Returns the name of the display device. Should be zeroed if this information is not available.
+    char cDisplayName[ADL_MAX_DISPLAY_NAME];
+/// Returns the maximum Horizontal supported resolution. Should be zeroed if this information is not available.
+    int  ulMaxHResolution;
+/// Returns the maximum Vertical supported resolution. Should be zeroed if this information is not available.
+    int  ulMaxVResolution;
+/// Returns the maximum supported refresh rate. Should be zeroed if this information is not available.
+    int  ulMaxRefresh;
+/// Returns the display device preferred timing mode's horizontal resolution.
+    int  ulPTMCx;
+/// Returns the display device preferred timing mode's vertical resolution.
+    int  ulPTMCy;
+/// Returns the display device preferred timing mode's refresh rate.
+    int  ulPTMRefreshRate;
+/// Return EDID flags.
+    int  ulDDCInfoFlag;
+} ADLDDCInfo, *LPADLDDCInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing DDC information.
+///
+/// This structure is used to store various DDC information that can be returned to the user.
+/// Note that all fields of type int are actually defined as unsigned int types within the driver.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDDCInfo2
+{
+/// Size of the structure
+    int  ulSize;
+/// Indicates whether the attached display supports DDC. If this field is zero on return, no other DDC
+/// information fields will be used.
+    int  ulSupportsDDC;
+/// Returns the manufacturer ID of the display device. Should be zeroed if this information is not available.
+    int  ulManufacturerID;
+/// Returns the product ID of the display device. Should be zeroed if this information is not available.
+    int  ulProductID;
+/// Returns the name of the display device. Should be zeroed if this information is not available.
+    char cDisplayName[ADL_MAX_DISPLAY_NAME];
+/// Returns the maximum Horizontal supported resolution. Should be zeroed if this information is not available.
+    int  ulMaxHResolution;
+/// Returns the maximum Vertical supported resolution. Should be zeroed if this information is not available.
+    int  ulMaxVResolution;
+/// Returns the maximum supported refresh rate. Should be zeroed if this information is not available.
+    int  ulMaxRefresh;
+/// Returns the display device preferred timing mode's horizontal resolution.
+    int  ulPTMCx;
+/// Returns the display device preferred timing mode's vertical resolution.
+    int  ulPTMCy;
+/// Returns the display device preferred timing mode's refresh rate.
+    int  ulPTMRefreshRate;
+/// Return EDID flags.
+    int  ulDDCInfoFlag;
+/// Returns 1 if the display supported packed pixel, 0 otherwise
+    int bPackedPixelSupported;
+/// Returns the Pixel formats the display supports \ref define_ddcinfo_pixelformats
+    int iPanelPixelFormat;
+/// Return EDID serial ID.
+    int  ulSerialID;
+/// Return minimum monitor luminance data
+    int ulMinLuminanceData;
+/// Return average monitor luminance data
+    int ulAvgLuminanceData;
+/// Return maximum monitor luminance data
+    int ulMaxLuminanceData;
+
+/// Bit vector of supported transfer functions \ref define_source_content_TF
+    int iSupportedTransferFunction;
+
+/// Bit vector of supported color spaces \ref define_source_content_CS
+    int iSupportedColorSpace;
+
+/// Display Red Chromaticity X coordinate multiplied by 10000
+    int iNativeDisplayChromaticityRedX;
+/// Display Red Chromaticity Y coordinate multiplied by 10000
+    int iNativeDisplayChromaticityRedY;
+/// Display Green Chromaticity X coordinate multiplied by 10000
+    int iNativeDisplayChromaticityGreenX;
+/// Display Green Chromaticity Y coordinate multiplied by 10000
+    int iNativeDisplayChromaticityGreenY;
+/// Display Blue Chromaticity X coordinate multiplied by 10000
+    int iNativeDisplayChromaticityBlueX;
+/// Display Blue Chromaticity Y coordinate multiplied by 10000
+    int iNativeDisplayChromaticityBlueY;
+/// Display White Point X coordinate multiplied by 10000
+    int iNativeDisplayChromaticityWhitePointX;
+/// Display White Point Y coordinate multiplied by 10000
+    int iNativeDisplayChromaticityWhitePointY;
+/// Display diffuse screen reflectance 0-1 (100%) in units of 0.01
+    int iDiffuseScreenReflectance;
+/// Display specular screen reflectance 0-1 (100%) in units of 0.01
+    int iSpecularScreenReflectance;
+/// Bit vector of HDR support flags \ref define_HDR_support
+    int iSupportedHDR;
+
+    // Reserved for future use
+    int iReserved[10];
+} ADLDDCInfo2, *LPADLDDCInfo2;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about controller Gamma settings.
+///
+/// This structure is used to store the red, green and blue color channel information for the
+/// controller gamma setting. This information is returned by ADL, and it can also be used to
+/// set the controller gamma setting.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGamma
+{
+/// Red color channel gamma value.
+	float fRed;
+/// Green color channel gamma value.
+	float fGreen;
+/// Blue color channel gamma value.
+	float fBlue;
+} ADLGamma, *LPADLGamma;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about component video custom modes.
+///
+/// This structure is used to store the component video custom mode.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLCustomMode
+{
+/// Custom mode flags.  They are returned by the ADL driver.
+	int iFlags;
+/// Custom mode width.
+	int iModeWidth;
+/// Custom mode height.
+	int iModeHeight;
+/// Custom mode base width.
+	int iBaseModeWidth;
+/// Custom mode base height.
+	int iBaseModeHeight;
+/// Custom mode refresh rate.
+	int iRefreshRate;
+} ADLCustomMode, *LPADLCustomMode;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing Clock information for OD5 calls.
+///
+/// This structure is used to retrieve clock information for OD5 calls.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGetClocksOUT
+{
+    long ulHighCoreClock;
+    long ulHighMemoryClock;
+    long ulHighVddc;
+    long ulCoreMin;
+    long ulCoreMax;
+    long ulMemoryMin;
+    long ulMemoryMax;
+    long ulActivityPercent;
+    long ulCurrentCoreClock;
+    long ulCurrentMemoryClock;
+    long ulReserved;
+} ADLGetClocksOUT;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing HDTV information for display calls.
+///
+/// This structure is used to retrieve HDTV information for display calls.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayConfig
+{
+/// Size of the structure
+  long ulSize;
+/// HDTV connector type.
+  long ulConnectorType;
+/// HDTV capabilities.
+  long ulDeviceData;
+/// Overridden HDTV capabilities.
+  long ulOverridedDeviceData;
+/// Reserved field
+  long ulReserved;
+} ADLDisplayConfig;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display device.
+///
+/// This structure is used to store display device information
+/// such as display index, type, name, connection status, mapped adapter and controller indexes,
+/// whether or not multiple VPUs are supported, local display connections or not (through Lasso), etc.
+/// This information can be returned to the user. Alternatively, it can be used to access various driver calls to set
+/// or fetch various display device related settings upon the user's request.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayID
+{
+/// The logical display index belonging to this adapter.
+	int iDisplayLogicalIndex;
+
+///\brief The physical display index.
+/// For example, display index 2 from adapter 2 can be used by current adapter 1.\n
+/// So current adapter may enumerate this adapter as logical display 7 but the physical display
+/// index is still 2.
+	int iDisplayPhysicalIndex;
+
+/// The persistent logical adapter index for the display.
+	int iDisplayLogicalAdapterIndex;
+
+///\brief The persistent physical adapter index for the display.
+/// It can be the current adapter or a non-local adapter. \n
+/// If this adapter index is different than the current adapter,
+/// the Display Non Local flag is set inside DisplayInfoValue.
+    int iDisplayPhysicalAdapterIndex;
+} ADLDisplayID, *LPADLDisplayID;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display device.
+///
+/// This structure is used to store various information about the display device.  This
+/// information can be returned to the user, or used to access various driver calls to set
+/// or fetch various display-device-related settings upon the user's request
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayInfo
+{
+/// The DisplayID structure
+	ADLDisplayID displayID;
+
+///\deprecated The controller index to which the display is mapped.\n Will not be used in the future\n
+	int  iDisplayControllerIndex;
+
+/// The display's EDID name.
+	char strDisplayName[ADL_MAX_PATH];
+
+/// The display's manufacturer name.
+	char strDisplayManufacturerName[ADL_MAX_PATH];
+
+/// The Display type. For example: CRT, TV, CV, DFP.
+	int  iDisplayType;
+
+/// The display output type. For example: HDMI, SVIDEO, COMPONENT VIDEO.
+	int  iDisplayOutputType;
+
+/// The connector type for the device.
+	int  iDisplayConnector;
+
+///\brief The bit mask identifies the number of bits ADLDisplayInfo is currently using. \n
+/// It will be the sum of all the bit definitions in ADL_DISPLAY_DISPLAYINFO_xxx.
+	int  iDisplayInfoMask;
+
+/// The bit mask identifies the display status. \ref define_displayinfomask
+	int  iDisplayInfoValue;
+} ADLDisplayInfo, *LPADLDisplayInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display port MST device.
+///
+/// This structure is used to store various MST information about the display port device.  This
+/// information can be returned to the user, or used to access various driver calls to
+/// fetch various display-device-related settings upon the user's request
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayDPMSTInfo
+{
+	/// The ADLDisplayID structure
+	ADLDisplayID displayID;
+
+	/// Total bandwidth available on the DP connector
+	int	iTotalAvailableBandwidthInMpbs;
+	/// Bandwidth allocated to this display
+	int	iAllocatedBandwidthInMbps;
+
+	// info from DAL DpMstSinkInfo
+	/// String identifier for the display
+	char	strGlobalUniqueIdentifier[ADL_MAX_PATH];
+
+	/// The link count of relative address, rad[0] up to rad[linkCount] are valid
+	int		radLinkCount;
+	/// The physical connector ID, used to identify the physical DP port
+	int		iPhysicalConnectorID;
+
+	/// Relative address, address scheme starts from source side
+	char	rad[ADL_MAX_RAD_LINK_COUNT];
+} ADLDisplayDPMSTInfo, *LPADLDisplayDPMSTInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the display mode definition used per controller.
+///
+/// This structure is used to store the display mode definition used per controller.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayMode
+{
+/// Vertical resolution (in pixels).
+   int  iPelsHeight;
+/// Horizontal resolution (in pixels).
+   int  iPelsWidth;
+/// Color depth.
+   int  iBitsPerPel;
+/// Refresh rate.
+   int  iDisplayFrequency;
+} ADLDisplayMode;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing detailed timing parameters.
+///
+/// This structure is used to store the detailed timing parameters.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDetailedTiming
+{
+/// Size of the structure.
+     int   iSize;
+/// Timing flags. \ref define_detailed_timing_flags
+     short sTimingFlags;
+/// Total width (columns).
+     short sHTotal;
+/// Displayed width.
+     short sHDisplay;
+/// Horizontal sync signal offset.
+     short sHSyncStart;
+/// Horizontal sync signal width.
+     short sHSyncWidth;
+/// Total height (rows).
+     short sVTotal;
+/// Displayed height.
+     short sVDisplay;
+/// Vertical sync signal offset.
+     short sVSyncStart;
+/// Vertical sync signal width.
+     short sVSyncWidth;
+/// Pixel clock value.
+     short sPixelClock;
+/// Overscan right.
+     short sHOverscanRight;
+/// Overscan left.
+     short sHOverscanLeft;
+/// Overscan bottom.
+     short sVOverscanBottom;
+/// Overscan top.
+     short sVOverscanTop;
+     short sOverscan8B;
+     short sOverscanGR;
+} ADLDetailedTiming;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing display mode information.
+///
+/// This structure is used to store the display mode information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayModeInfo
+{
+/// Timing standard of the current mode. \ref define_modetiming_standard
+  int  iTimingStandard;
+/// Applicable timing standards for the current mode.
+  int  iPossibleStandard;
+/// Refresh rate factor.
+  int  iRefreshRate;
+/// Num of pixels in a row.
+  int  iPelsWidth;
+/// Num of pixels in a column.
+  int  iPelsHeight;
+/// Detailed timing parameters.
+  ADLDetailedTiming  sDetailedTiming;
+} ADLDisplayModeInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about display property.
+///
+/// This structure is used to store the display property for the current adapter.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayProperty
+{
+/// Must be set to the size of the structure.
+  int iSize;
+/// Must be set to \ref ADL_DL_DISPLAYPROPERTY_TYPE_EXPANSIONMODE or \ref ADL_DL_DISPLAYPROPERTY_TYPE_USEUNDERSCANSCALING
+  int iPropertyType;
+/// Get or Set \ref ADL_DL_DISPLAYPROPERTY_EXPANSIONMODE_CENTER or \ref ADL_DL_DISPLAYPROPERTY_EXPANSIONMODE_FULLSCREEN or \ref ADL_DL_DISPLAYPROPERTY_EXPANSIONMODE_ASPECTRATIO or \ref ADL_DL_DISPLAYPROPERTY_TYPE_ITCFLAGENABLE
+  int iExpansionMode;
+/// Display Property supported? 1: Supported, 0: Not supported
+  int iSupport;
+/// Display Property current value
+  int iCurrent;
+/// Display Property Default value
+  int iDefault;
+} ADLDisplayProperty;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Clock.
+///
+/// This structure is used to store the clock information for the current adapter
+/// such as core clock and memory clock info.
+///\nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLClockInfo
+{
+/// Core clock, in units of 10 kHz.
+    int iCoreClock;
+/// Memory clock, in units of 10 kHz.
+    int iMemoryClock;
+} ADLClockInfo, *LPADLClockInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about I2C.
+///
+/// This structure is used to store the I2C information for the current adapter.
+/// This structure is used by the ADL_Display_WriteAndReadI2C() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLI2C
+{
+/// Size of the structure
+    int iSize;
+/// Numerical value representing hardware I2C.
+    int iLine;
+/// The 7-bit I2C slave device address, shifted one bit to the left.
+    int iAddress;
+/// The offset of the data from the address.
+    int iOffset;
+/// Read from or write to slave device. \ref ADL_DL_I2C_ACTIONREAD or \ref ADL_DL_I2C_ACTIONWRITE or \ref ADL_DL_I2C_ACTIONREAD_REPEATEDSTART
+    int iAction;
+/// I2C clock speed in kHz.
+    int iSpeed;
+/// A numerical value representing the number of bytes to be sent or received on the I2C bus.
+    int iDataSize;
+/// Address of the characters which are to be sent or received on the I2C bus.
+    char *pcData;
+} ADLI2C;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about EDID data.
+///
+/// This structure is used to store the information about EDID data for the adapter.
+/// This structure is used by the ADL_Display_EdidData_Get() and ADL_Display_EdidData_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayEDIDData
+{
+/// Size of the structure
+  int iSize;
+/// Set to 0
+  int iFlag;
+/// Size of cEDIDData. Set by ADL_Display_EdidData_Get() upon return.
+  int iEDIDSize;
+/// Block index: 0, 1 or 2. If set to 3 or above, the error ADL_ERR_INVALID_PARAM is generated.
+  int iBlockIndex;
+/// EDID data
+  char cEDIDData[ADL_MAX_EDIDDATA_SIZE];
+/// Reserved
+  int iReserved[4];
+}ADLDisplayEDIDData;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about input of controller overlay adjustment.
+///
+/// This structure is used to store the information about input of controller overlay adjustment for the adapter.
+/// This structure is used by the ADL_Display_ControllerOverlayAdjustmentCaps_Get, ADL_Display_ControllerOverlayAdjustmentData_Get, and
+/// ADL_Display_ControllerOverlayAdjustmentData_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLControllerOverlayInput
+{
+/// Should be set to the size of the structure.
+  int  iSize;
+///\ref ADL_DL_CONTROLLER_OVERLAY_ALPHA or \ref ADL_DL_CONTROLLER_OVERLAY_ALPHAPERPIX
+  int  iOverlayAdjust;
+/// Data.
+  int  iValue;
+/// Should be 0.
+  int  iReserved;
+} ADLControllerOverlayInput;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about overlay adjustment.
+///
+/// This structure is used to store the information about overlay adjustment for the adapter.
+/// This structure is used by the ADLControllerOverlayInfo structure.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAdjustmentinfo
+{
+/// Default value
+  int iDefault;
+/// Minimum value
+  int iMin;
+/// Maximum Value
+  int iMax;
+/// Step value
+  int iStep;
+} ADLAdjustmentinfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about controller overlay information.
+///
+/// This structure is used to store information about controller overlay info for the adapter.
+/// This structure is used by the ADL_Display_ControllerOverlayAdjustmentCaps_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLControllerOverlayInfo
+{
+/// Should be set to the size of the structure.
+  int					iSize;
+/// Data.
+  ADLAdjustmentinfo	    sOverlayInfo;
+/// Should be 0.
+  int					iReserved[3];
+} ADLControllerOverlayInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync module information.
+///
+/// This structure is used to retrieve GL-Sync module information for
+/// Workstation Framelock/Genlock.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGLSyncModuleID
+{
+/// Unique GL-Sync module ID.
+	int		iModuleID;
+/// GL-Sync GPU port index (to be passed into ADLGLSyncGenlockConfig.iSignalSource and ADLGlSyncPortControl.iSignalSource).
+	int		iGlSyncGPUPort;
+/// GL-Sync module firmware version of Boot Sector.
+	int		iFWBootSectorVersion;
+/// GL-Sync module firmware version of User Sector.
+	int		iFWUserSectorVersion;
+} ADLGLSyncModuleID , *LPADLGLSyncModuleID;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync ports capabilities.
+///
+/// This structure is used to retrieve hardware capabilities for the ports of the GL-Sync module
+/// for Workstation Framelock/Genlock (such as port type and number of associated LEDs).
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGLSyncPortCaps
+{
+/// Port type. Bitfield of ADL_GLSYNC_PORTTYPE_*  \ref define_glsync
+	int		iPortType;
+/// Number of LEDs associated for this port.
+	int		iNumOfLEDs;
+}ADLGLSyncPortCaps, *LPADLGLSyncPortCaps;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync Genlock settings.
+///
+/// This structure is used to get and set genlock settings for the GPU ports of the GL-Sync module
+/// for Workstation Framelock/Genlock.\n
+/// \see define_glsync
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGLSyncGenlockConfig
+{
+/// Specifies what fields in this structure are valid \ref define_glsync
+	int		iValidMask;
+/// Delay (ms) generating a sync signal.
+	int		iSyncDelay;
+/// Vector of framelock control bits. Bitfield of ADL_GLSYNC_FRAMELOCKCNTL_* \ref define_glsync
+	int		iFramelockCntlVector;
+/// Source of the sync signal. Either GL_Sync GPU Port index or ADL_GLSYNC_SIGNALSOURCE_* \ref define_glsync
+	int		iSignalSource;
+/// Use sampled sync signal. A value of 0 specifies no sampling.
+	int		iSampleRate;
+/// For interlaced sync signals, the value can be ADL_GLSYNC_SYNCFIELD_1 or *_BOTH \ref define_glsync
+	int		iSyncField;
+/// The signal edge that should trigger synchronization. ADL_GLSYNC_TRIGGEREDGE_* \ref define_glsync
+	int		iTriggerEdge;
+/// Scan rate multiplier applied to the sync signal. ADL_GLSYNC_SCANRATECOEFF_* \ref define_glsync
+	int		iScanRateCoeff;
+}ADLGLSyncGenlockConfig, *LPADLGLSyncGenlockConfig;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync port information.
+///
+/// This structure is used to get status of the GL-Sync ports (BNC or RJ45s)
+/// for Workstation Framelock/Genlock.
+/// \see define_glsync
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGlSyncPortInfo
+{
+/// Type of GL-Sync port (ADL_GLSYNC_PORT_*).
+	int		iPortType;
+/// The number of LEDs for this port. It's also filled within ADLGLSyncPortCaps.
+	int		iNumOfLEDs;
+/// Port state ADL_GLSYNC_PORTSTATE_*  \ref define_glsync
+	int		iPortState;
+/// Scanned frequency for this port (vertical refresh rate in milliHz; 60000 means 60 Hz).
+	int		iFrequency;
+/// Used for ADL_GLSYNC_PORT_BNC. It is ADL_GLSYNC_SIGNALTYPE_*   \ref define_glsync
+	int		iSignalType;
+/// Used for ADL_GLSYNC_PORT_RJ45PORT*. It is GL_Sync GPU Port index or ADL_GLSYNC_SIGNALSOURCE_*.  \ref define_glsync
+	int		iSignalSource;
+
+} ADLGlSyncPortInfo, *LPADLGlSyncPortInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync port control settings.
+///
+/// This structure is used to configure the GL-Sync ports (RJ45s only)
+/// for Workstation Framelock/Genlock.
+/// \see define_glsync
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGlSyncPortControl
+{
+/// Port to control ADL_GLSYNC_PORT_RJ45PORT1 or ADL_GLSYNC_PORT_RJ45PORT2   \ref define_glsync
+	int		iPortType;
+/// Port control data ADL_GLSYNC_PORTCNTL_*   \ref define_glsync
+	int		iControlVector;
+/// Source of the sync signal. Either GL_Sync GPU Port index or ADL_GLSYNC_SIGNALSOURCE_*   \ref define_glsync
+	int		iSignalSource;
+} ADLGlSyncPortControl;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync mode of a display.
+///
+/// This structure is used to get and set GL-Sync mode settings for a display connected to
+/// an adapter attached to a GL-Sync module for Workstation Framelock/Genlock.
+/// \see define_glsync
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGlSyncMode
+{
+/// Mode control vector. Bitfield of ADL_GLSYNC_MODECNTL_*   \ref define_glsync
+	int		iControlVector;
+/// Mode status vector. Bitfield of ADL_GLSYNC_MODECNTL_STATUS_*   \ref define_glsync
+	int		iStatusVector;
+/// Index of GL-Sync connector used to genlock the display/controller.
+	int		iGLSyncConnectorIndex;
+} ADLGlSyncMode, *LPADLGlSyncMode;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing GL-Sync mode of a display.
+///
+/// This structure is used to get and set GL-Sync mode settings for a display connected to
+/// an adapter attached to a GL-Sync module for Workstation Framelock/Genlock.
+/// \see define_glsync
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGlSyncMode2
+{
+/// Mode control vector. Bitfield of ADL_GLSYNC_MODECNTL_*   \ref define_glsync
+	int		iControlVector;
+/// Mode status vector. Bitfield of ADL_GLSYNC_MODECNTL_STATUS_*   \ref define_glsync
+	int		iStatusVector;
+/// Index of GL-Sync connector used to genlock the display/controller.
+	int		iGLSyncConnectorIndex;
+/// Index of the display to which this GLSync applies.
+	int		iDisplayIndex;
+} ADLGlSyncMode2, *LPADLGlSyncMode2;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the packet info of a display.
+///
+/// This structure is used to get and set the packet information of a display.
+/// This structure is used by ADLDisplayDataPacket.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct  ADLInfoPacket
+{
+	char hb0; ///< NOTE(review): undocumented; presumably packet header byte 0 - TODO confirm.
+	char hb1; ///< NOTE(review): undocumented; presumably packet header byte 1 - TODO confirm.
+	char hb2; ///< NOTE(review): undocumented; presumably packet header byte 2 - TODO confirm.
+/// sb0~sb27
+	char sb[28];
+}ADLInfoPacket;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the AVI packet info of a display.
+///
+/// This structure is used to get and set the AVI packet info of a display.
+/// This structure is used by ADLDisplayDataPacket.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAVIInfoPacket  //Valid user defined data/
+{
+/// byte 3, bit 7
+   char bPB3_ITC;
+/// byte 5, bit [7:4].
+   char bPB5;
+}ADLAVIInfoPacket;
+
+// Overdrive clock setting structure definition.
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the Overdrive clock setting.
+///
+/// This structure is used to get the Overdrive clock setting.
+/// This structure is used by ADLAdapterODClockInfo.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODClockSetting
+{
+/// Default clock
+	int iDefaultClock;
+/// Current clock
+	int iCurrentClock;
+/// Maximum clock
+	int iMaxClock;
+/// Minimum clock
+	int iMinClock;
+/// Requested clock
+	int iRequestedClock;
+/// Step
+	int iStepClock;
+} ADLODClockSetting;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the Overdrive clock information.
+///
+/// This structure is used to get the Overdrive clock information.
+/// This structure is used by the ADL_Display_ODClockInfo_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAdapterODClockInfo
+{
+/// Size of the structure
+	int iSize;
+/// Flag \ref define_clockinfo_flags
+	int iFlags;
+/// Memory Clock
+	ADLODClockSetting sMemoryClock;
+/// Engine Clock
+	ADLODClockSetting sEngineClock;
+} ADLAdapterODClockInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the Overdrive clock configuration.
+///
+/// This structure is used to set the Overdrive clock configuration.
+/// This structure is used by the ADL_Display_ODClockConfig_Set() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAdapterODClockConfig
+{
+/// Size of the structure
+  int iSize;
+/// Flag \ref define_clockinfo_flags
+  int iFlags;
+/// Memory Clock
+  int iMemoryClock;
+/// Engine Clock
+  int iEngineClock;
+} ADLAdapterODClockConfig;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about current power management related activity.
+///
+/// This structure is used to store information about current power management related activity.
+/// This structure (Overdrive 5 interfaces) is used by the ADL_PM_CurrentActivity_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPMActivity
+{
+/// Must be set to the size of the structure
+	int iSize;
+/// Current engine clock.
+	int iEngineClock;
+/// Current memory clock.
+	int iMemoryClock;
+/// Current core voltage.
+	int iVddc;
+/// GPU utilization.
+	int iActivityPercent;
+/// Performance level index.
+	int iCurrentPerformanceLevel;
+/// Current PCIE bus speed.
+	int iCurrentBusSpeed;
+/// Number of PCIE bus lanes.
+	int iCurrentBusLanes;
+/// Maximum number of PCIE bus lanes.
+	int iMaximumBusLanes;
+/// Reserved for future purposes.
+	int iReserved;
+} ADLPMActivity;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about thermal controller.
+///
+/// This structure is used to store information about thermal controller.
+/// This structure is used by ADL_PM_ThermalDevices_Enum.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLThermalControllerInfo
+{
+/// Must be set to the size of the structure
+  int iSize;
+/// Possible values: \ref ADL_DL_THERMAL_DOMAIN_OTHER or \ref ADL_DL_THERMAL_DOMAIN_GPU.
+  int iThermalDomain;
+/// Domain index: GPU 0, 1, etc.
+  int iDomainIndex;
+/// Possible values: \ref ADL_DL_THERMAL_FLAG_INTERRUPT or \ref ADL_DL_THERMAL_FLAG_FANCONTROL
+  int iFlags;
+} ADLThermalControllerInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about thermal controller temperature.
+///
+/// This structure is used to store information about thermal controller temperature.
+/// This structure is used by the ADL_PM_Temperature_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLTemperature
+{
+/// Must be set to the size of the structure
+  int iSize;
+/// Temperature in millidegrees Celsius.
+  int iTemperature;
+} ADLTemperature;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about thermal controller fan speed.
+///
+/// This structure is used to store information about thermal controller fan speed.
+/// This structure is used by the ADL_PM_FanSpeedInfo_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLFanSpeedInfo
+{
+/// Must be set to the size of the structure
+  int iSize;
+/// \ref define_fanctrl
+  int iFlags;
+/// Minimum possible fan speed value in percent.
+  int iMinPercent;
+/// Maximum possible fan speed value in percent.
+  int iMaxPercent;
+/// Minimum possible fan speed value in RPM.
+  int iMinRPM;
+/// Maximum possible fan speed value in RPM.
+  int iMaxRPM;
+} ADLFanSpeedInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about fan speed reported by thermal controller.
+///
+/// This structure is used to store information about fan speed reported by thermal controller.
+/// This structure is used by the ADL_Overdrive5_FanSpeed_Get() and ADL_Overdrive5_FanSpeed_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLFanSpeedValue
+{
+/// Must be set to the size of the structure
+  int iSize;
+/// Possible values: \ref ADL_DL_FANCTRL_SPEED_TYPE_PERCENT or \ref ADL_DL_FANCTRL_SPEED_TYPE_RPM
+  int iSpeedType;
+/// Fan speed value
+  int iFanSpeed;
+/// The only flag for now is: \ref ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED
+  int iFlags;
+} ADLFanSpeedValue;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the range of Overdrive parameter.
+///
+/// This structure is used to store information about the range of Overdrive parameter.
+/// This structure is used by ADLODParameters.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODParameterRange
+{
+/// Minimum parameter value.
+  int iMin;
+/// Maximum parameter value.
+  int iMax;
+/// Parameter step value.
+  int iStep;
+} ADLODParameterRange;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive parameters.
+///
+/// This structure is used to store information about Overdrive parameters.
+/// This structure is used by the ADL_Overdrive5_ODParameters_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODParameters
+{
+/// Must be set to the size of the structure
+  int iSize;
+/// Number of standard performance states.
+  int iNumberOfPerformanceLevels;
+/// Indicates whether the GPU is capable of measuring its activity.
+  int iActivityReportingSupported;
+/// Indicates whether the GPU supports discrete performance levels or performance range.
+  int iDiscretePerformanceLevels;
+/// Reserved for future use.
+  int iReserved;
+/// Engine clock range.
+  ADLODParameterRange sEngineClock;
+/// Memory clock range.
+  ADLODParameterRange sMemoryClock;
+/// Core voltage range.
+  ADLODParameterRange sVddc;
+} ADLODParameters;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive level.
+///
+/// This structure is used to store information about Overdrive level.
+/// This structure is used by ADLODPerformanceLevels.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODPerformanceLevel
+{
+/// Engine clock.
+  int iEngineClock;
+/// Memory clock.
+  int iMemoryClock;
+/// Core voltage.
+  int iVddc;
+} ADLODPerformanceLevel;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive performance levels.
+///
+/// This structure is used to store information about Overdrive performance levels.
+/// This structure is used by the ADL_Overdrive5_ODPerformanceLevels_Get() and ADL_Overdrive5_ODPerformanceLevels_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODPerformanceLevels
+{
+/// Must be set to sizeof( \ref ADLODPerformanceLevels ) + sizeof( \ref ADLODPerformanceLevel ) * (ADLODParameters.iNumberOfPerformanceLevels - 1)
+  int iSize;
+  int iReserved; ///< NOTE(review): undocumented; presumably reserved, per its name - TODO confirm.
+/// Array of performance state descriptors. Must have ADLODParameters.iNumberOfPerformanceLevels elements.
+  ADLODPerformanceLevel aLevels [1];
+} ADLODPerformanceLevels;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the proper CrossfireX chains combinations.
+///
+/// This structure is used to store information about the CrossfireX chains combination for a particular adapter.
+/// This structure is used by the ADL_Adapter_Crossfire_Caps(), ADL_Adapter_Crossfire_Get(), and ADL_Adapter_Crossfire_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLCrossfireComb
+{
+/// Number of adapters in this combination.
+  int iNumLinkAdapter;
+/// A list of ADL indexes of the linked adapters in this combination.
+  int iAdaptLink[3];
+} ADLCrossfireComb;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing CrossfireX state and error information.
+///
+/// This structure is used to store state and error information about a particular adapter CrossfireX combination.
+/// This structure is used by the ADL_Adapter_Crossfire_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLCrossfireInfo
+{
+/// Current error code of this CrossfireX combination.
+  int iErrorCode;
+/// Current \ref define_crossfirestate
+  int iState;
+/// If CrossfireX is supported by this combination. The value is either \ref ADL_TRUE or \ref ADL_FALSE.
+  int iSupported;
+} ADLCrossfireInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about the BIOS.
+///
+/// This structure is used to store various information about the BIOS.  This
+/// information can be returned to the user.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLBiosInfo
+{
+	char strPartNumber[ADL_MAX_PATH];	///< Part number.
+	char strVersion[ADL_MAX_PATH];		///< Version number.
+	char strDate[ADL_MAX_PATH];		///< BIOS date in yyyy/mm/dd hh:mm format.
+} ADLBiosInfo, *LPADLBiosInfo;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about adapter location.
+///
+/// This structure is used to store information about adapter location.
+/// This structure is used by ADLMVPUStatus.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAdapterLocation
+{
+/// PCI Bus number : 8 bits
+	int iBus;
+/// Device number : 5 bits
+	int iDevice;
+/// Function number : 3 bits
+	int iFunction;
+} ADLAdapterLocation,ADLBdf;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing version information
+///
+/// This structure is used to store software version information, description of the display device and a web link to the latest installed Catalyst drivers.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLVersionsInfo
+{
+	/// Driver Release (Packaging) Version (e.g. 8.71-100128n-094835E-ATI)
+	char strDriverVer[ADL_MAX_PATH];
+	/// Catalyst Version(e.g. "10.1").
+	char strCatalystVersion[ADL_MAX_PATH];
+	/// Web link to an XML file with information about the latest AMD drivers and locations (e.g. "http://www.amd.com/us/driverxml" )
+	char strCatalystWebLink[ADL_MAX_PATH];
+
+} ADLVersionsInfo, *LPADLVersionsInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing version information
+///
+/// This structure is used to store software version information, description of the display device and a web link to the latest installed Catalyst drivers.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLVersionsInfoX2
+{
+	/// Driver Release (Packaging) Version (e.g. "16.20.1035-160621a-303814C")
+	char strDriverVer[ADL_MAX_PATH];
+	/// Catalyst Version(e.g. "15.8").
+	char strCatalystVersion[ADL_MAX_PATH];
+	/// Crimson Version(e.g. "16.6.2").
+	char strCrimsonVersion[ADL_MAX_PATH];
+	/// Web link to an XML file with information about the latest AMD drivers and locations (e.g. "http://support.amd.com/drivers/xml/driver_09_us.xml" )
+	char strCatalystWebLink[ADL_MAX_PATH];
+
+} ADLVersionsInfoX2, *LPADLVersionsInfoX2;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about MultiVPU capabilities.
+///
+/// This structure is used to store information about MultiVPU capabilities.
+/// This structure is used by the ADL_Display_MVPUCaps_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLMVPUCaps
+{
+/// Must be set to sizeof( ADLMVPUCaps ).
+  int iSize;
+/// Number of adapters.
+  int iAdapterCount;
+/// Bits set for all possible MVPU masters. \ref MVPU_ADAPTER_0 .. \ref MVPU_ADAPTER_3
+  int iPossibleMVPUMasters;
+/// Bits set for all possible MVPU slaves. \ref MVPU_ADAPTER_0 .. \ref MVPU_ADAPTER_3
+  int iPossibleMVPUSlaves;
+/// Registry path for each adapter.
+  char cAdapterPath[ADL_DL_MAX_MVPU_ADAPTERS][ADL_DL_MAX_REGISTRY_PATH];
+} ADLMVPUCaps;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about MultiVPU status.
+///
+/// This structure is used to store information about MultiVPU status.
+/// This structure is used by the ADL_Display_MVPUStatus_Get() function.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLMVPUStatus
+{
+/// Must be set to sizeof( ADLMVPUStatus ).
+  int iSize;
+/// Number of active adapters.
+  int iActiveAdapterCount;
+/// MVPU status.
+  int iStatus;
+/// PCI Bus/Device/Function for each active adapter participating in MVPU.
+  ADLAdapterLocation aAdapterLocation[ADL_DL_MAX_MVPU_ADAPTERS];
+} ADLMVPUStatus;
+
+// Displays Manager structures
+
+///////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about the activatable source.
+///
+/// This structure is used to store activatable source information
+/// This information can be returned to the user.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLActivatableSource
+{
+	/// The Persistent logical Adapter Index.
+    int iAdapterIndex;
+	/// The number of Activatable Sources.
+    int iNumActivatableSources;
+	/// The bit mask identifies the number of bits ActivatableSourceValue is using. (Not currently used)
+	int iActivatableSourceMask;
+	/// The bit mask identifies the status.  (Not currently used)
+	int iActivatableSourceValue;
+} ADLActivatableSource, *LPADLActivatableSource;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about display mode.
+///
+/// This structure is used to store the display mode for the current adapter
+/// such as X, Y positions, screen resolutions, orientation,
+/// color depth, refresh rate, progressive or interlace mode, etc.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct ADLMode
+{
+/// Adapter index.
+    int iAdapterIndex;
+/// Display IDs.
+    ADLDisplayID displayID;
+/// Screen position X coordinate.
+    int iXPos;
+/// Screen position Y coordinate.
+    int iYPos;
+/// Screen resolution Width.
+    int iXRes;
+/// Screen resolution Height.
+    int iYRes;
+/// Screen Color Depth. E.g., 16, 32.
+    int iColourDepth;
+/// Screen refresh rate. May be fractional, e.g., 59.97.
+    float fRefreshRate;
+/// Screen orientation. E.g., 0, 90, 180, 270.
+    int iOrientation;
+/// Vista mode flag indicating Progressive or Interlaced mode.
+    int iModeFlag;
+/// The bit mask identifying the number of bits this Mode is currently using. It is the sum of all the bit definitions defined in \ref define_displaymode
+    int iModeMask;
+/// The bit mask identifying the display status. The detailed definition is in \ref define_displaymode
+    int iModeValue;
+} ADLMode, *LPADLMode;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about display target information.
+///
+/// This structure is used to store the display target information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayTarget
+{
+	/// The Display ID.
+	ADLDisplayID displayID;
+
+	/// The display map index; it identifies this manner and the desktop surface.
+	int iDisplayMapIndex;
+
+	/// The bit mask identifies the number of bits DisplayTarget is currently using. It is the sum of all the bit definitions defined in \ref ADL_DISPLAY_DISPLAYTARGET_PREFERRED.
+	int  iDisplayTargetMask;
+
+	/// The bit mask identifies the display status. The detailed definition is in \ref ADL_DISPLAY_DISPLAYTARGET_PREFERRED.
+    int  iDisplayTargetValue;
+
+} ADLDisplayTarget, *LPADLDisplayTarget;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display SLS bezel Mode information.
+///
+/// This structure is used to store the display SLS bezel Mode information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct tagADLBezelTransientMode
+{
+	/// Adapter Index
+    int iAdapterIndex;
+
+	/// SLS Map Index
+    int iSLSMapIndex;
+
+	/// The mode index
+    int iSLSModeIndex;
+
+	/// The mode
+	ADLMode displayMode;
+
+	/// The number of bezel offsets belonging to this map
+    int  iNumBezelOffset;
+
+	/// The first bezel offset array index in the native mode array
+    int  iFirstBezelOffsetArrayIndex;
+
+    /// The bit mask identifies the bits this structure is currently using. It will be the total OR of all the bit definitions.
+    int  iSLSBezelTransientModeMask;
+
+    /// The bit mask identifies the display status. The detailed definition is given below.
+	int  iSLSBezelTransientModeValue;
+
+} ADLBezelTransientMode, *LPADLBezelTransientMode;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about the adapter display manner.
+///
+/// This structure is used to store adapter display manner information
+/// This information can be returned to the user. Alternatively, it can be used to access various driver calls to
+/// fetch various display device related display manner settings upon the user's request.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAdapterDisplayCap
+{
+	/// The Persistent logical Adapter Index.
+    int iAdapterIndex;
+	/// The bit mask identifies the number of bits AdapterDisplayCap is currently using. It is the sum of all the bits defined in ADL_ADAPTER_DISPLAYCAP_XXX.
+    int  iAdapterDisplayCapMask;
+	/// The bit mask identifies the status. Refer to ADL_ADAPTER_DISPLAYCAP_XXX.
+    int  iAdapterDisplayCapValue;
+} ADLAdapterDisplayCap, *LPADLAdapterDisplayCap;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about display mapping.
+///
+/// This structure is used to store the display mapping data such as display manner.
+/// For displays with horizontal or vertical stretch manner,
+/// this structure also stores the display order, display row, and column data.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayMap
+{
+/// The current display map index. It is the OS desktop index. For example, if the OS index 1 is showing clone mode, the display map will be 1.
+	int iDisplayMapIndex;
+
+/// The Display Mode for the current map
+	ADLMode displayMode;
+
+/// The number of display targets belonging to this map\n
+	int iNumDisplayTarget;
+
+/// The first target array index in the Target array\n
+	int iFirstDisplayTargetArrayIndex;
+
+/// The bit mask identifies the number of bits DisplayMap is currently using. It is the sum of all the bit definitions defined in ADL_DISPLAY_DISPLAYMAP_MANNER_xxx.
+ 	int  iDisplayMapMask;
+
+/// The bit mask identifies the display status. The detailed definition is in ADL_DISPLAY_DISPLAYMAP_MANNER_xxx.
+	int  iDisplayMapValue;
+
+} ADLDisplayMap, *LPADLDisplayMap;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about the display device possible map for one GPU
+///
+/// This structure is used to store the display device possible map
+/// This information can be returned to the user.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPossibleMap
+{
+	/// The current PossibleMap index. Each PossibleMap is assigned an index.
+    int iIndex;
+	/// The adapter index identifying the GPU for which to validate these Maps & Targets.
+	int iAdapterIndex;
+	/// Number of display Maps for this GPU to be validated.
+    int iNumDisplayMap;
+	/// The display Maps list to validate.
+    ADLDisplayMap* displayMap;
+	/// The number of display Targets for these display Maps.
+    int iNumDisplayTarget;
+	/// The display Targets list for these display Maps to be validated.
+    ADLDisplayTarget* displayTarget;
+} ADLPossibleMap, *LPADLPossibleMap;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about display possible mapping.
+///
+/// This structure is used to store the display possible mapping's controller index for the current display.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPossibleMapping
+{
+    int iDisplayIndex;				///< The display index. Each display is assigned an index.
+	int iDisplayControllerIndex;	///< The controller index to which the display is mapped.
+	int iDisplayMannerSupported;	///< The supported display manner.
+} ADLPossibleMapping, *LPADLPossibleMapping;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Structure containing information about the validated display device possible map result.
+///
+/// This structure is used to store the validated display device possible map result
+/// This information can be returned to the user.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPossibleMapResult
+{
+	/// The current display map index. It is the OS Desktop index. For example, OS Index 1 showing clone mode. The Display Map will be 1.
+    int iIndex;
+	/// The bit mask identifies the number of bits PossibleMapResult is currently using. It will be the sum of all the bit definitions defined in ADL_DISPLAY_POSSIBLEMAPRESULT_VALID.
+	int iPossibleMapResultMask;
+	/// The bit mask identifies the possible map result. The detailed definition is in ADL_DISPLAY_POSSIBLEMAPRESULT_XXX.
+	int iPossibleMapResultValue;
+} ADLPossibleMapResult, *LPADLPossibleMapResult;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display SLS Grid information.
+///
+/// This structure is used to store the display SLS Grid information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLSLSGrid
+{
+/// The Adapter index.
+	int iAdapterIndex;
+
+/// The grid index.
+	int  iSLSGridIndex;
+
+/// The grid row.
+	int  iSLSGridRow;
+
+/// The grid column.
+	int  iSLSGridColumn;
+
+/// The grid bit mask identifies the number of bits DisplayMap is currently using (NOTE(review): "DisplayMap" presumably means this SLS grid; confirm). It is the sum of all bits defined in ADL_DISPLAY_SLSGRID_ORIENTATION_XXX.
+	int  iSLSGridMask;
+
+/// The grid bit value identifies the display status. Refer to ADL_DISPLAY_SLSGRID_ORIENTATION_XXX.
+	int  iSLSGridValue;
+
+} ADLSLSGrid, *LPADLSLSGrid;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display SLS Map information.
+///
+/// This structure is used to store the display SLS Map information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct	ADLSLSMap
+{
+	/// The Adapter Index
+	int iAdapterIndex;
+
+	/// The current display map index. It is the OS Desktop index. For example, OS Index 1 showing clone mode. The Display Map will be 1.
+    int iSLSMapIndex;
+
+	/// Indicates the current grid
+    ADLSLSGrid grid;
+
+	/// OS surface index
+	int  iSurfaceMapIndex;
+
+	 /// Screen orientation. E.g., 0, 90, 180, 270
+     int iOrientation;
+
+	/// The number of display targets belonging to this map
+    int  iNumSLSTarget;
+
+	/// The first target array index in the Target array
+    int  iFirstSLSTargetArrayIndex;
+
+	/// The number of native modes belonging to this map
+	int  iNumNativeMode;
+
+	/// The first native mode array index in the native mode array
+    int  iFirstNativeModeArrayIndex;
+
+	/// The number of bezel modes belonging to this map
+	int  iNumBezelMode;
+
+	/// The first bezel mode array index in the native mode array
+    int  iFirstBezelModeArrayIndex;
+
+	/// The number of bezel offsets belonging to this map
+	int  iNumBezelOffset;
+
+	/// The first bezel offset array index (NOTE(review): the original comment was truncated after "in the"; the target array is not stated, confirm against the SDK documentation)
+    int  iFirstBezelOffsetArrayIndex;
+
+	/// The bit mask identifies the number of bits DisplayMap is currently using. It is the sum of all the bit definitions defined in ADL_DISPLAY_SLSMAP_XXX.
+    int  iSLSMapMask;
+
+	/// The bit mask identifies the display map status. Refer to ADL_DISPLAY_SLSMAP_XXX
+    int  iSLSMapValue;
+
+
+} ADLSLSMap, *LPADLSLSMap;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display SLS Offset information.
+///
+/// This structure is used to store the display SLS Offset information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLSLSOffset
+{
+	/// The Adapter Index
+	int iAdapterIndex;
+
+	/// The current display map index. It is the OS Desktop index. For example, OS Index 1 showing clone mode. The Display Map will be 1.
+    int iSLSMapIndex;
+
+	/// The Display ID.
+	ADLDisplayID displayID;
+
+	/// SLS Bezel Mode Index
+	int iBezelModeIndex;
+
+	/// SLS Bezel Offset X
+	int iBezelOffsetX;
+
+	/// SLS Bezel Offset Y
+	int iBezelOffsetY;
+
+	/// SLS Display Width
+	int iDisplayWidth;
+
+	/// SLS Display Height
+	int iDisplayHeight;
+
+	/// The bit mask identifies the number of bits Offset is currently using.
+	int iBezelOffsetMask;
+
+	/// The bit mask identifies the display status. NOTE: the field name is spelled "iBezelffsetValue" (sic, missing 'O'); it is kept as-is because renaming it would break existing users of the struct.
+	int  iBezelffsetValue;
+} ADLSLSOffset, *LPADLSLSOffset;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display SLS Mode information.
+///
+/// This structure is used to store the display SLS Mode information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLSLSMode
+{
+	/// The Adapter Index
+	int iAdapterIndex;
+
+	/// The current display map index. It is the OS Desktop index. For example, OS Index 1 showing clone mode. The Display Map will be 1.
+    int iSLSMapIndex;
+
+	/// The SLS mode index
+	int iSLSModeIndex;
+
+	/// The display mode for this map.
+    ADLMode displayMode;
+
+	/// The bit mask identifies the number of bits Mode is currently using.
+    int iSLSNativeModeMask;
+
+	/// The bit mask identifies the display status.
+	int iSLSNativeModeValue;
+} ADLSLSMode, *LPADLSLSMode;
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the display Possible SLS Map information.
+///
+/// This structure is used to store the display Possible SLS Map information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPossibleSLSMap
+{
+	/// The current display map index. It is the OS Desktop index.
+	/// For example, OS Index 1 showing clone mode. The Display Map will be 1.
+    int iSLSMapIndex;
+
+	/// Number of display maps to be validated.
+    int iNumSLSMap;
+
+	/// The display map list for validation
+    ADLSLSMap* lpSLSMap;
+
+	/// The number of display map configs to be validated. (NOTE(review): the field name suggests this counts SLS targets; confirm.)
+    int iNumSLSTarget;
+
+	/// The display target list for validation.
+    ADLDisplayTarget* lpDisplayTarget;
+} ADLPossibleSLSMap, *LPADLPossibleSLSMap;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the SLS targets.
+///
+/// This structure is used to store the SLS targets information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLSLSTarget
+{
+	/// The logical adapter index
+    int iAdapterIndex;
+
+	/// The SLS map index
+    int iSLSMapIndex;
+
+	/// The target ID
+    ADLDisplayTarget displayTarget;
+
+	/// Target position X in SLS grid
+	int iSLSGridPositionX;
+
+	/// Target position Y in SLS grid
+    int iSLSGridPositionY;
+
+	/// The view size width, height and rotation angle per SLS Target
+	ADLMode viewSize;
+
+	/// The bit mask identifies which bits in iSLSTargetValue are currently used
+    int iSLSTargetMask;
+
+	/// The bit mask identifies status info. It is for function extension purposes
+    int iSLSTargetValue;
+
+} ADLSLSTarget, *LPADLSLSTarget;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the Adapter offset stepping size.
+///
+/// This structure is used to store the Adapter offset stepping size information.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLBezelOffsetSteppingSize
+{
+	/// The logical adapter index
+    int iAdapterIndex;
+
+	/// The SLS map index
+    int iSLSMapIndex;
+
+	/// Bezel X stepping size offset
+	int iBezelOffsetSteppingSizeX;
+
+	/// Bezel Y stepping size offset
+	int iBezelOffsetSteppingSizeY;
+
+	/// Identifies the bits this structure is currently using. It will be the total OR of all the bit definitions.
+	int iBezelOffsetSteppingSizeMask;
+
+	/// Bit mask identifies the display status.
+	int iBezelOffsetSteppingSizeValue;
+
+} ADLBezelOffsetSteppingSize, *LPADLBezelOffsetSteppingSize;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about the overlap offset info for all the displays for each SLS mode.
+///
+/// This structure is used to store the no. of overlapped modes for each SLS Mode once user finishes the configuration from Overlap Widget
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLSLSOverlappedMode
+{
+	/// The SLS mode for which the overlap is configured.
+	ADLMode SLSMode;
+	/// The number of target displays in SLS.
+	int iNumSLSTarget;
+    /// The first target array index in the target array.
+	int iFirstTargetArrayIndex;
+}ADLSLSTargetOverlap, *LPADLSLSTargetOverlap; // NOTE: the struct tag is ADLSLSOverlappedMode, but the typedef names are ADLSLSTargetOverlap / LPADLSLSTargetOverlap.
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about driver supported PowerExpress Config Caps
+///
+/// This structure is used to store the driver supported PowerExpress Config Caps
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPXConfigCaps
+{
+    /// The Persistent logical Adapter Index.
+    int iAdapterIndex;
+
+    /// The bit mask identifies the number of bits PowerExpress Config Caps is currently using. It is the sum of all the bit definitions defined in ADL_PX_CONFIGCAPS_XXXX \ref define_powerxpress_constants.
+    int  iPXConfigCapMask;
+
+    /// The bit mask identifies the PowerExpress Config Caps value. The detailed definition is in ADL_PX_CONFIGCAPS_XXXX \ref define_powerxpress_constants.
+    int  iPXConfigCapValue;
+
+} ADLPXConfigCaps, *LPADLPXConfigCaps;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about an application
+///
+/// This structure is used to store basic information of an application
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLApplicationData
+{
+	/// Path name (char buffer of ADL_MAX_PATH elements)
+	char strPathName[ADL_MAX_PATH];
+	/// File name (char buffer of ADL_APP_PROFILE_FILENAME_LENGTH elements)
+	char strFileName[ADL_APP_PROFILE_FILENAME_LENGTH];
+	/// Creation timestamp (char buffer of ADL_APP_PROFILE_TIMESTAMP_LENGTH elements)
+	char strTimeStamp[ADL_APP_PROFILE_TIMESTAMP_LENGTH];
+	/// Version (char buffer of ADL_APP_PROFILE_VERSION_LENGTH elements)
+	char strVersion[ADL_APP_PROFILE_VERSION_LENGTH];
+}ADLApplicationData;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about an application (wide-character variant)
+///
+/// This structure is used to store basic information of an application using wide-character (wchar_t) strings
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLApplicationDataX2
+{
+	/// Path name (wchar_t buffer of ADL_MAX_PATH elements)
+	wchar_t strPathName[ADL_MAX_PATH];
+	/// File name (wchar_t buffer of ADL_APP_PROFILE_FILENAME_LENGTH elements)
+	wchar_t strFileName[ADL_APP_PROFILE_FILENAME_LENGTH];
+	/// Creation timestamp (wchar_t buffer of ADL_APP_PROFILE_TIMESTAMP_LENGTH elements)
+	wchar_t strTimeStamp[ADL_APP_PROFILE_TIMESTAMP_LENGTH];
+	/// Version (wchar_t buffer of ADL_APP_PROFILE_VERSION_LENGTH elements)
+	wchar_t strVersion[ADL_APP_PROFILE_VERSION_LENGTH];
+}ADLApplicationDataX2;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about an application
+///
+/// This structure is used to store basic information of an application including process id
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLApplicationDataX3
+{
+    /// Path name (wide-character)
+    wchar_t strPathName[ADL_MAX_PATH];
+    /// File name (wide-character)
+    wchar_t strFileName[ADL_APP_PROFILE_FILENAME_LENGTH];
+    /// Creation timestamp (wide-character)
+    wchar_t strTimeStamp[ADL_APP_PROFILE_TIMESTAMP_LENGTH];
+    /// Version (wide-character)
+    wchar_t strVersion[ADL_APP_PROFILE_VERSION_LENGTH];
+    /// Application process ID
+    unsigned int iProcessId;
+}ADLApplicationDataX3;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information of a property of an application profile
+///
+/// This structure is used to store property information of an application profile
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _PropertyRecord
+{
+	/// Property Name
+	char strName [ADL_APP_PROFILE_PROPERTY_LENGTH];
+	/// Property Type
+	ADLProfilePropertyType eType;
+	/// Data Size in bytes
+	int iDataSize;
+	/// Property Value, can be any data type. Declared with one element; presumably the start of a buffer of iDataSize bytes (TODO confirm).
+	unsigned char uData[1];
+}PropertyRecord;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about an application profile
+///
+/// This structure is used to store information of an application profile
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLApplicationProfile
+{
+	/// Number of properties
+	int iCount;
+	/// Buffer to store all property records. Declared with one element; presumably holds iCount records (TODO confirm).
+	PropertyRecord record[1];
+}ADLApplicationProfile;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about an OD5 Power Control feature
+///
+/// This structure is used to store information about a Power Control feature
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPowerControlInfo
+{
+/// Minimum value.
+int iMinValue;
+/// Maximum value.
+int iMaxValue;
+/// The minimum change between iMinValue and iMaxValue.
+int iStepValue;
+ } ADLPowerControlInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about a controller mode
+///
+/// This structure is used to store information about a controller mode
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLControllerMode
+{
+    /// This flag indicates actions that will be applied by set viewport.
+    /// The value can be a combination of ADL_CONTROLLERMODE_CM_MODIFIER_VIEW_POSITION,
+    /// ADL_CONTROLLERMODE_CM_MODIFIER_VIEW_PANLOCK and ADL_CONTROLLERMODE_CM_MODIFIER_VIEW_SIZE
+    int iModifiers;
+
+    /// Horizontal view starting position
+    int iViewPositionCx;
+
+    /// Vertical view starting position
+    int iViewPositionCy;
+
+    /// Horizontal left panlock position
+    int iViewPanLockLeft;
+
+    /// Horizontal right panlock position
+    int iViewPanLockRight;
+
+    /// Vertical top panlock position
+    int iViewPanLockTop;
+
+    /// Vertical bottom panlock position
+    int iViewPanLockBottom;
+
+    /// View resolution in pixels (width)
+    int iViewResolutionCx;
+
+    /// View resolution in pixels (height)
+    int iViewResolutionCy;
+}ADLControllerMode;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about a display
+///
+/// This structure is used to store information about a display
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayIdentifier
+{
+    /// ADL display index. NOTE(review): the fields of this struct carry the "ul" prefix but are declared as (signed) long.
+    long ulDisplayIndex;
+
+    /// Manufacturer ID of the display
+    long ulManufacturerId;
+
+    /// Product ID of the display
+    long ulProductId;
+
+    /// Serial number of the display
+    long ulSerialNo;
+
+} ADLDisplayIdentifier;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 clock range
+///
+/// This structure is used to store information about Overdrive 6 clock range
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6ParameterRange
+{
+    /// The starting (minimum) value of the clock range
+    int 	iMin;
+    /// The ending (maximum) value of the clock range
+    int 	iMax;
+    /// The minimum increment (step) between clock values
+    int 	iStep;
+
+} ADLOD6ParameterRange;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 capabilities
+///
+/// This structure is used to store information about Overdrive 6 capabilities
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6Capabilities
+{
+    /// Contains a bitmap of the OD6 capability flags.  Possible values: \ref ADL_OD6_CAPABILITY_SCLK_CUSTOMIZATION,
+    /// \ref ADL_OD6_CAPABILITY_MCLK_CUSTOMIZATION, \ref ADL_OD6_CAPABILITY_GPU_ACTIVITY_MONITOR
+    int 	iCapabilities;
+    /// Contains a bitmap indicating the power states
+    /// supported by OD6.  Currently only the performance state
+    /// is supported. Possible Values: \ref ADL_OD6_SUPPORTEDSTATE_PERFORMANCE
+    int 	iSupportedStates;
+    /// Number of levels. OD6 will always use 2 levels, which describe
+    /// the minimum to maximum clock ranges.
+    /// The 1st level indicates the minimum clocks, and the 2nd level
+    /// indicates the maximum clocks.
+    int     iNumberOfPerformanceLevels;
+    /// Contains the hard limits of the sclk (engine clock) range.  Overdrive
+    /// clocks cannot be set outside this range.
+    ADLOD6ParameterRange 	sEngineClockRange;
+    /// Contains the hard limits of the mclk (memory clock) range.  Overdrive
+    /// clocks cannot be set outside this range.
+    ADLOD6ParameterRange 	sMemoryClockRange;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6Capabilities;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 clock values.
+///
+/// This structure is used to store information about Overdrive 6 clock values.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6PerformanceLevel
+{
+    /// Engine (core) clock. Units are not stated here; presumably 10 kHz as in ADLOD6CurrentStatus (TODO confirm).
+    int iEngineClock;
+    /// Memory clock. Units are not stated here; presumably 10 kHz as in ADLOD6CurrentStatus (TODO confirm).
+    int iMemoryClock;
+
+} ADLOD6PerformanceLevel;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 clocks.
+///
+/// This structure is used to store information about Overdrive 6 clocks.  This is a
+/// variable-sized structure.  iNumberOfPerformanceLevels indicate how many elements
+/// are contained in the aLevels array.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6StateInfo
+{
+    /// Number of levels.  OD6 uses clock ranges instead of discrete performance levels.
+    /// iNumberOfPerformanceLevels is always 2.  The 1st level indicates the minimum clocks
+    /// in the range.  The 2nd level indicates the maximum clocks in the range.
+    int     iNumberOfPerformanceLevels;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+    /// Variable-sized array of levels (declared with one element).
+    /// The number of elements in the array is specified by iNumberOfPerformanceLevels.
+    ADLOD6PerformanceLevel aLevels [1];
+
+} ADLOD6StateInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about current Overdrive 6 performance status.
+///
+/// This structure is used to store information about current Overdrive 6 performance status.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6CurrentStatus
+{
+    /// Current engine clock, in units of 10 kHz.
+    int 	iEngineClock;
+    /// Current memory clock, in units of 10 kHz.
+    int 	iMemoryClock;
+    /// Current GPU activity in percent.  This
+    /// indicates how "busy" the GPU is.
+    int 	iActivityPercent;
+    /// Not used.  Reserved for future use.
+    int 	iCurrentPerformanceLevel;
+    /// Current PCI-E bus speed
+    int 	iCurrentBusSpeed;
+    /// Current number of PCI-E bus lanes
+    int 	iCurrentBusLanes;
+    /// Maximum possible number of PCI-E bus lanes
+    int 	iMaximumBusLanes;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6CurrentStatus;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 thermal controller capabilities
+///
+/// This structure is used to store information about Overdrive 6 thermal controller capabilities
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6ThermalControllerCaps
+{
+    /// Contains a bitmap of thermal controller capability flags. Possible values: \ref ADL_OD6_TCCAPS_THERMAL_CONTROLLER, \ref ADL_OD6_TCCAPS_FANSPEED_CONTROL,
+    /// \ref ADL_OD6_TCCAPS_FANSPEED_PERCENT_READ, \ref ADL_OD6_TCCAPS_FANSPEED_PERCENT_WRITE, \ref ADL_OD6_TCCAPS_FANSPEED_RPM_READ, \ref ADL_OD6_TCCAPS_FANSPEED_RPM_WRITE
+    int 	iCapabilities;
+    /// Minimum fan speed expressed as a percentage
+    int 	iFanMinPercent;
+    /// Maximum fan speed expressed as a percentage
+    int 	iFanMaxPercent;
+    /// Minimum fan speed expressed in revolutions per minute (RPM)
+    int 	iFanMinRPM;
+    /// Maximum fan speed expressed in revolutions per minute (RPM)
+    int 	iFanMaxRPM;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6ThermalControllerCaps;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 fan speed information
+///
+/// This structure is used to store information about Overdrive 6 fan speed information
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6FanSpeedInfo
+{
+    /// Contains a bitmap of the valid fan speed type flags.  Possible values: \ref ADL_OD6_FANSPEED_TYPE_PERCENT, \ref ADL_OD6_FANSPEED_TYPE_RPM, \ref ADL_OD6_FANSPEED_USER_DEFINED
+    int 	iSpeedType;
+    /// Contains the current fan speed in percent (if the valid flag exists in iSpeedType)
+    int 	iFanSpeedPercent;
+    /// Contains the current fan speed in RPM (if the valid flag exists in iSpeedType)
+    int	    iFanSpeedRPM;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6FanSpeedInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 fan speed value
+///
+/// This structure is used to store information about Overdrive 6 fan speed value
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6FanSpeedValue
+{
+    /// Indicates the units of the fan speed.  Possible values: \ref ADL_OD6_FANSPEED_TYPE_PERCENT, \ref ADL_OD6_FANSPEED_TYPE_RPM
+    int 	iSpeedType;
+    /// Fan speed value (units as indicated above)
+    int 	iFanSpeed;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6FanSpeedValue;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 PowerControl settings.
+///
+/// This structure is used to store information about Overdrive 6 PowerControl settings.
+/// PowerControl is the feature which allows the performance characteristics of the GPU
+/// to be adjusted by changing the PowerTune power limits.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6PowerControlInfo
+{
+    /// The minimum PowerControl adjustment value
+    int 	iMinValue;
+    /// The maximum PowerControl adjustment value
+    int 	iMaxValue;
+    /// The minimum difference between PowerControl adjustment values
+    int 	iStepValue;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6PowerControlInfo;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 VoltageControl settings.
+///
+/// This structure is used to store information about Overdrive 6 VoltageControl settings:
+/// the minimum and maximum VoltageControl adjustment values and the minimum
+/// difference (step) between adjustment values.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6VoltageControlInfo
+{
+    /// The minimum VoltageControl adjustment value
+    int 	iMinValue;
+    /// The maximum VoltageControl adjustment value
+    int 	iMaxValue;
+    /// The minimum difference between VoltageControl adjustment values
+    int 	iStepValue;
+
+    /// Value for future extension
+    int     iExtValue;
+    /// Mask for future extension
+    int     iExtMask;
+
+} ADLOD6VoltageControlInfo;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing ECC statistics namely SEC counts and DED counts
+/// Single error count - count of errors that can be corrected
+/// Double error detect - count of errors that cannot be corrected
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLECCData
+{
+	/// Single error count - count of errors that can be corrected
+	int iSec;
+	/// Double error detect - count of errors that cannot be corrected
+	int iDed;
+
+} ADLECCData;
+
+
+/// \brief Handle to ADL client context.
+///
+///  ADL clients obtain context handle from initial call to \ref ADL2_Main_Control_Create.
+///  Clients have to pass the handle to each subsequent ADL call and finally destroy
+///  the context with call to \ref ADL2_Main_Control_Destroy
+/// \nosubgrouping
+typedef void *ADL_CONTEXT_HANDLE;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the display mode definition used per controller.
+///
+/// This structure is used to store the display mode definition used per controller.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDisplayModeX2
+{
+/// Horizontal resolution (in pixels).
+   int  iWidth;
+/// Vertical resolution (in lines).
+   int  iHeight;
+/// Interlaced/Progressive. The value will be set for Interlaced as ADL_DL_TIMINGFLAG_INTERLACED. If not set it is progressive. Refer define_detailed_timing_flags.
+   int  iScanType;
+/// Refresh rate.
+   int  iRefreshRate;
+/// Timing Standard. Refer define_modetiming_standard.
+   int  iTimingStandard;
+} ADLDisplayModeX2;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 extension capabilities
+///
+/// This structure is used to store information about Overdrive 6 extension capabilities
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6CapabilitiesEx
+{
+    /// Contains a bitmap of the OD6 extension capability flags.  Possible values: \ref ADL_OD6_CAPABILITY_SCLK_CUSTOMIZATION,
+    /// \ref ADL_OD6_CAPABILITY_MCLK_CUSTOMIZATION, \ref ADL_OD6_CAPABILITY_GPU_ACTIVITY_MONITOR,
+    /// \ref ADL_OD6_CAPABILITY_POWER_CONTROL, \ref ADL_OD6_CAPABILITY_VOLTAGE_CONTROL, \ref ADL_OD6_CAPABILITY_PERCENT_ADJUSTMENT,
+    /// \ref ADL_OD6_CAPABILITY_THERMAL_LIMIT_UNLOCK
+    int iCapabilities;
+    /// The Power states that support clock and power customization.  Only performance state is currently supported.
+    /// Possible Values: \ref ADL_OD6_SUPPORTEDSTATE_PERFORMANCE
+    int iSupportedStates;
+    /// Returns the hard limits of the SCLK overdrive adjustment range.  Overdrive clocks should not be adjusted outside of this range.  The values are specified as +/- percentages.
+    ADLOD6ParameterRange sEngineClockPercent;
+    /// Returns the hard limits of the MCLK overdrive adjustment range.  Overdrive clocks should not be adjusted outside of this range.  The values are specified as +/- percentages.
+    ADLOD6ParameterRange sMemoryClockPercent;
+    /// Returns the hard limits of the Power Limit adjustment range.  Power limit should not be adjusted outside this range.  The values are specified as +/- percentages.
+    ADLOD6ParameterRange sPowerControlPercent;
+    /// Reserved for future expansion of the structure.
+    int iExtValue;
+    /// Reserved for future expansion of the structure.
+    int iExtMask;
+} ADLOD6CapabilitiesEx;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 extension state information
+///
+/// This structure is used to store information about Overdrive 6 extension state information
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6StateEx
+{
+    /// The current engine clock adjustment value, specified as a +/- percent.
+    int iEngineClockPercent;
+    /// The current memory clock adjustment value, specified as a +/- percent.
+    int iMemoryClockPercent;
+    /// The current power control adjustment value, specified as a +/- percent.
+    int iPowerControlPercent;
+    /// Reserved for future expansion of the structure.
+    int iExtValue;
+    /// Reserved for future expansion of the structure.
+    int iExtMask;
+} ADLOD6StateEx;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive 6 extension recommended maximum clock adjustment values
+///
+/// This structure is used to store information about Overdrive 6 extension recommended maximum clock adjustment values
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLOD6MaxClockAdjust
+{
+    /// The recommended maximum engine clock adjustment in percent, for the specified power limit value.
+    int iEngineClockMax;
+    /// The recommended maximum memory clock adjustment in percent, for the specified power limit value.
+    /// Currently the memory is independent of the Power Limit setting, so iMemoryClockMax will always return the maximum
+    /// possible adjustment value.  This field is here for future enhancement in case we add a dependency between Memory Clock
+    /// adjustment and Power Limit setting.
+    int iMemoryClockMax;
+    /// Reserved for future expansion of the structure.
+    int iExtValue;
+    /// Reserved for future expansion of the structure.
+    int iExtMask;
+} ADLOD6MaxClockAdjust;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the Connector information
+///
+/// This structure is used to get the connector information such as its index, slot, type, position and length.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLConnectorInfo
+{
+	///index of the connector(0-based)
+	int iConnectorIndex;
+	///used for display identification/ordering
+	int iConnectorId;
+	///index of the slot, 0-based index.
+	int iSlotIndex;
+	///Type of the connector. \ref define_connector_types
+	int iType;
+	///Position of the connector(in millimeters), from the right side of the slot.
+	int iOffset;
+	///Length of the connector(in millimeters).
+	int iLength;
+
+} ADLConnectorInfo;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing the slot information
+///
+/// this structure is used to get the slot information like length of the slot, no of connectors on the slot & etc.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLBracketSlotInfo
+{
+	///index of the slot, 0-based index.
+	int iSlotIndex;
+	///length of the slot(in millimeters).
+	int iLength;
+	///width of the slot(in millimeters).
+	int iWidth;
+} ADLBracketSlotInfo;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing MST branch information
+///
+/// this structure is used to store the MST branch information
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLMSTRad
+{
+	///depth of the link.
+	int iLinkNumber;
+	/// Relative address, address scheme starts from source side
+	char rad[ADL_MAX_RAD_LINK_COUNT];
+} ADLMSTRad;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing port information
+///
+/// this structure is used to get the display or MST branch information
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLDevicePort
+{
+	///index of the connector.
+	int iConnectorIndex;
+	///Relative MST address. If MST RAD contains 0 it means DP or Root of the MST topology. For non DP connectors MST RAD is ignored.
+	ADLMSTRad aMSTRad;
+} ADLDevicePort;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing supported connection types and properties
+///
+/// this structure is used to get the supported connection types and supported properties of given connector
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLSupportedConnections
+{
+	///Bit vector of supported connections. Bitmask is defined in constants section. \ref define_connection_types
+	int iSupportedConnections;
+	///Array of bitvectors. Each bit vector represents supported properties for one connection type. Index of this array is connection type (bit number in mask).
+	int iSupportedProperties[ADL_MAX_CONNECTION_TYPES];
+} ADLSupportedConnections;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing connection state of the connector
+///
+/// this structure is used to get the current Emulation status and mode of the given connector
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLConnectionState
+{
+	///The value is bit vector. Each bit represents status. See masks constants for details. \ref define_emulation_status
+	int iEmulationStatus;
+	///It contains information about current emulation mode. See constants for details. \ref define_emulation_mode
+	int iEmulationMode;
+	///If connection is active it will contain display id, otherwise CWDDEDI_INVALID_DISPLAY_INDEX
+	int iDisplayIndex;
+} ADLConnectionState;
+
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing connection properties information
+///
+/// this structure is used to retrieve the properties of connection type
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLConnectionProperties
+{
+	///Bit vector. Represents actual properties. Supported properties for specific connection type. \ref define_connection_properties
+	int iValidProperties;
+	///Bitrate(in MHz). Could be used for MST branch, DP or DP active dongle. \ref define_linkrate_constants
+	int iBitrate;
+	///Number of lanes in DP connection. \ref define_lanecount_constants
+	int iNumberOfLanes;
+	///Color depth(in bits). \ref define_colordepth_constants
+	int iColorDepth;
+	///3D capabilities. It could be used for some dongles. For instance: alternate framepack. Value of this property is bit vector.
+	int iStereo3DCaps;
+	///Output Bandwidth. Could be used for MST branch, DP or DP Active dongle. \ref define_linkrate_constants
+	int iOutputBandwidth;
+} ADLConnectionProperties;
+
+////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing connection information
+///
+/// this structure is used to retrieve connection data from the driver: connection type, properties, port counts and EDID data
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLConnectionData
+{
+	///Connection type. based on the connection type either iNumberofPorts or IDataSize,EDIDdata is valid, \ref define_connection_types
+	int iConnectionType;
+	///Specifies the connection properties.
+	ADLConnectionProperties aConnectionProperties;
+	///Number of ports
+	int iNumberofPorts;
+	///Number of Active Connections
+	int iActiveConnections;
+	///actual size of EDID data block size.
+	int iDataSize;
+	///EDID Data
+	char EdidData[ADL_MAX_DISPLAY_EDID_DATA_SIZE];
+} ADLConnectionData;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about adapter capabilities including Number of Connectors
+///
+/// This structure is used to store the capabilities and resource counts of an adapter
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLAdapterCapsX2
+{
+	/// AdapterID for this adapter
+	int iAdapterID;
+	/// Number of controllers for this adapter
+	int iNumControllers;
+	/// Number of displays for this adapter
+	int iNumDisplays;
+	/// Number of overlays for this adapter
+	int iNumOverlays;
+	/// Number of GLSyncConnectors
+	int iNumOfGLSyncConnectors;
+	/// The bit mask identifies the adapter caps
+	int iCapsMask;
+	/// The bit identifies the adapter caps \ref define_adapter_caps
+	int iCapsValue;
+	/// Number of Connectors for this adapter
+	int iNumConnectors;
+}ADLAdapterCapsX2;
+
+typedef enum _ADL_ERROR_RECORD_SEVERITY
+{
+    ADL_GLOBALLY_UNCORRECTED  = 1,
+    ADL_LOCALLY_UNCORRECTED   = 2,
+    ADL_DEFFERRED             = 3,
+    ADL_CORRECTED             = 4
+}ADL_ERROR_RECORD_SEVERITY;
+
+typedef union _ADL_ECC_EDC_FLAG
+{
+    struct
+    {
+        unsigned int isEccAccessing        : 1;
+        unsigned int reserved              : 31;
+    }bits;
+    unsigned int u32All;
+}ADL_ECC_EDC_FLAG;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about EDC Error Record
+///
+/// This structure is used to store EDC Error Record
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLErrorRecord
+{
+    // Severity of error
+    ADL_ERROR_RECORD_SEVERITY Severity;
+
+    // Is the counter valid?
+    int  countValid;
+
+    // Counter value, if valid
+    unsigned int count;
+
+    // Is the location information valid?
+    int locationValid;
+
+    // Physical location of error
+    unsigned int CU; // CU number on which error occurred, if known
+    char StructureName[32]; // e.g. LDS, TCC, etc.
+
+    // Time of error record creation (e.g. time of query, or time of poison interrupt)
+    char tiestamp[32]; // NOTE(review): name is presumably a typo of "timestamp"; kept because renaming changes the public struct
+
+    unsigned int padding[3];
+}ADLErrorRecord;
+
+typedef enum _ADL_EDC_BLOCK_ID
+{
+    ADL_EDC_BLOCK_ID_SQCIS = 1,
+    ADL_EDC_BLOCK_ID_SQCDS = 2,
+    ADL_EDC_BLOCK_ID_SGPR  = 3,
+    ADL_EDC_BLOCK_ID_VGPR  = 4,
+    ADL_EDC_BLOCK_ID_LDS   = 5,
+    ADL_EDC_BLOCK_ID_GDS   = 6,
+    ADL_EDC_BLOCK_ID_TCL1  = 7,
+    ADL_EDC_BLOCK_ID_TCL2  = 8
+}ADL_EDC_BLOCK_ID;
+
+typedef enum _ADL_ERROR_INJECTION_MODE
+{
+    ADL_ERROR_INJECTION_MODE_SINGLE      = 1,
+    ADL_ERROR_INJECTION_MODE_MULTIPLE    = 2,
+    ADL_ERROR_INJECTION_MODE_ADDRESS     = 3
+}ADL_ERROR_INJECTION_MODE;
+
+typedef union _ADL_ERROR_PATTERN
+{
+    struct
+    {
+        unsigned long  EccInjVector         :  16;
+        unsigned long  EccInjEn             :  9;
+        unsigned long  EccBeatEn            :  4;
+        unsigned long  EccChEn              :  4;
+        unsigned long  reserved             :  31;
+    } bits;
+    unsigned long long u64Value;
+} ADL_ERROR_PATTERN;
+
+typedef struct _ADL_ERROR_INJECTION_DATA
+{
+    unsigned long long errorAddress;
+    ADL_ERROR_PATTERN errorPattern;
+}ADL_ERROR_INJECTION_DATA;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about EDC Error Injection
+///
+/// This structure is used to store EDC Error Injection
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLErrorInjection
+{
+    ADL_EDC_BLOCK_ID blockId;
+    ADL_ERROR_INJECTION_MODE errorInjectionMode;
+}ADLErrorInjection;
+
+typedef struct ADLErrorInjectionX2
+{
+    ADL_EDC_BLOCK_ID blockId;
+    ADL_ERROR_INJECTION_MODE errorInjectionMode;
+    ADL_ERROR_INJECTION_DATA errorInjectionData;
+}ADLErrorInjectionX2;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing per display FreeSync capability information.
+///
+/// This structure is used to store the FreeSync capability of both the display and
+/// the GPU the display is connected to.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLFreeSyncCap
+{
+    /// FreeSync capability flags. \ref define_freesync_caps
+    int iCaps;
+    /// Reports minimum FreeSync refresh rate supported by the display in micro hertz
+    int iMinRefreshRateInMicroHz;
+    /// Reports maximum FreeSync refresh rate supported by the display in micro hertz
+    int iMaxRefreshRateInMicroHz;
+    /// Reserved
+    int iReserved[5];
+} ADLFreeSyncCap;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing per display Display Connectivity Experience Settings
+///
+/// This structure is used to store the Display Connectivity Experience settings of a
+/// display
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLDceSettings
+{
+    DceSettingsType type;                       // Defines which structure is in the union below
+    union
+    {
+        struct
+        {
+            bool qualityDetectionEnabled;
+        } HdmiLq;
+        struct
+        {
+            DpLinkRate linkRate;                // Read-only
+            unsigned int numberOfActiveLanes;   // Read-only
+            unsigned int numberofTotalLanes;    // Read-only
+            int relativePreEmphasis;            // Allowable values are -2 to +2
+            int relativeVoltageSwing;           // Allowable values are -2 to +2
+			int persistFlag;					
+        } DpLink;
+        struct
+        {
+            bool linkProtectionEnabled;         // Read-only
+        } Protection;
+    } Settings;
+    int iReserved[15];
+} ADLDceSettings;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Graphic Core
+///
+/// This structure is used to get Graphic Core Info
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLGraphicCoreInfo
+{
+    /// indicate the graphic core generation
+    int iGCGen;
+
+    /// Total number of CUs. Valid for GCN (iGCGen == GCN)
+    int iNumCUs;
+
+    /// Number of processing elements per CU. Valid for GCN (iGCGen == GCN)
+    int iNumPEsPerCU;
+
+    /// Total number of SIMDs. Valid for Pre GCN (iGCGen == Pre-GCN)
+    int iNumSIMDs;
+
+    /// Total number of ROPs. Valid for both GCN and Pre GCN
+    int iNumROPs;
+
+    /// reserved for future use
+    int iReserved[11];
+}ADLGraphicCoreInfo;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive N clock range
+///
+/// This structure is used to store information about Overdrive N clock range
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLODNParameterRange
+{
+	/// Mode of the range. NOTE(review): semantics are not documented in this header - confirm against the ADL SDK
+	int 	iMode;
+	/// The starting value of the clock range
+	int 	iMin;
+	/// The ending value of the clock range
+	int 	iMax;
+	/// The minimum increment between clock values
+	int 	iStep;
+	/// The default clock values
+	int 	iDefault;
+
+} ADLODNParameterRange;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive N capabilities
+///
+/// This structure is used to store information about Overdrive N capabilities
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct _ADLODNCapabilities
+{
+	/// Number of levels which describe the minimum to maximum clock ranges.
+	/// The 1st level indicates the minimum clocks, and the 2nd level
+	/// indicates the maximum clocks.
+	int     iMaximumNumberOfPerformanceLevels;
+	/// Contains the hard limits of the sclk range.  Overdrive
+	/// clocks cannot be set outside this range.
+	ADLODNParameterRange 	sEngineClockRange;
+	/// Contains the hard limits of the mclk range.  Overdrive
+	/// clocks cannot be set outside this range.
+	ADLODNParameterRange 	sMemoryClockRange;
+	/// Contains the hard limits of the vddc range.  Values
+	/// cannot be set outside this range.
+	ADLODNParameterRange 	svddcRange;
+	/// Contains the hard limits of the power range.  Values
+	/// cannot be set outside this range.
+	ADLODNParameterRange 	power;
+	/// Contains the hard limits of the power tune temperature range
+	/// (going by the field name - TODO confirm against the ADL SDK).
+	ADLODNParameterRange 	powerTuneTemperature;
+	/// Contains the hard limits of the fan temperature range
+	/// (going by the field name - TODO confirm against the ADL SDK).
+	ADLODNParameterRange 	fanTemperature;
+	/// Contains the hard limits of the fan speed range.  Values
+	/// cannot be set outside this range.
+	ADLODNParameterRange 	fanSpeed;
+	/// Contains the hard limits of the minimum performance clock range
+	/// (going by the field name - TODO confirm against the ADL SDK).
+	ADLODNParameterRange 	minimumPerformanceClock;
+} ADLODNCapabilities;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive N level.
+///
+/// This structure is used to store information about Overdrive N level.
+/// This structure is used by ADLODNPerformanceLevels.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODNPerformanceLevel
+{
+	/// clock.
+	int iClock;
+	/// VDDC.
+	int iVddc;
+	/// enabled
+	int iEnabled;
+} ADLODNPerformanceLevel;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive N performance levels.
+///
+/// This structure is used to store information about Overdrive performance levels.
+/// This structure is used by the ADL_OverdriveN_ODPerformanceLevels_Get() and ADL_OverdriveN_ODPerformanceLevels_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODNPerformanceLevels
+{
+	int iSize; ///< Presumably must be set to sizeof( \ref ADLODNPerformanceLevels ) + sizeof( \ref ADLODNPerformanceLevel ) * (iNumberOfPerformanceLevels - 1) - TODO confirm against the ADL SDK
+	/// Automatic/manual
+	int iMode;
+	/// Number of performance levels (presumably the element count of aLevels - TODO confirm)
+	int iNumberOfPerformanceLevels;
+	/// Array of performance state descriptors. Must have ADLODParameters.iNumberOfPerformanceLevels elements.
+	ADLODNPerformanceLevel aLevels[1];
+} ADLODNPerformanceLevels;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive N Fan Speed.
+///
+/// This structure is used to store information about Overdrive Fan control .
+/// This structure is used by the ADL_OverdriveN_ODPerformanceLevels_Get() and ADL_OverdriveN_ODPerformanceLevels_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODNFanControl
+{
+	int iMode;
+	int iFanControlMode;
+	int iCurrentFanSpeedMode;
+	int iCurrentFanSpeed;
+	int iTargetFanSpeed;
+	int iTargetTemperature;
+	int iMinPerformanceClock;
+	int iMinFanLimit;
+} ADLODNFanControl;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about Overdrive N power limit.
+///
+/// This structure is used to store information about Overdrive power limit.
+/// This structure is used by the ADL_OverdriveN_ODPerformanceLevels_Get() and ADL_OverdriveN_ODPerformanceLevels_Set() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLODNPowerLimitSetting
+{
+	int iMode;
+	int iTDPLimit;
+	int iMaxOperatingTemperature;
+} ADLODNPowerLimitSetting;
+
+typedef struct ADLODNPerformanceStatus
+{
+	int iCoreClock;
+	int iMemoryClock;
+	int iDCEFClock;
+	int iGFXClock;
+	int iUVDClock;
+	int iVCEClock;
+	int iGPUActivityPercent;
+	int iCurrentCorePerformanceLevel;
+	int iCurrentMemoryPerformanceLevel;
+	int iCurrentDCEFPerformanceLevel;
+	int iCurrentGFXPerformanceLevel;
+	int iUVDPerformanceLevel;
+	int iVCEPerformanceLevel;
+	int iCurrentBusSpeed;
+	int iCurrentBusLanes;
+	int iMaximumBusLanes;
+	int iVDDC;
+	int iVDDCI;
+} ADLODNPerformanceStatus;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+///\brief Structure containing information about PPLog settings.
+///
+/// This structure is used to store information about PPLog settings.
+/// This structure is used by the ADL2_PPLogSettings_Set() and ADL2_PPLogSettings_Get() functions.
+/// \nosubgrouping
+////////////////////////////////////////////////////////////////////////////////////////////
+typedef struct ADLPPLogSettings
+{
+    int BreakOnAssert;
+    int BreakOnWarn;
+    int LogEnabled;
+    int LogFieldMask;
+    int LogDestinations;
+    int LogSeverityEnabled;
+    int LogSourceMask;
+    int PowerProfilingEnabled;
+    int PowerProfilingTimeInterval;
+}ADLPPLogSettings;
+
+#endif /* ADL_STRUCTURES_H_ */
+
diff --git a/calibrate.go b/calibrate.go
index e6ff001..c3a1d50 100644
--- a/calibrate.go
+++ b/calibrate.go
@@ -1,6 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
-// +build opencl,!cuda
+// +build !cuda
 
 package main
 
diff --git a/cladldevice.go b/cladldevice.go
new file mode 100644
index 0000000..ced0df9
--- /dev/null
+++ b/cladldevice.go
@@ -0,0 +1,553 @@
+// Copyright (c) 2016 The Decred developers.
+
+// +build opencladl,!cuda,!opencl
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"math"
+	"os"
+	"sync"
+	"sync/atomic"
+	"time"
+	"unsafe"
+
+	"github.com/decred/gominer/adl"
+	"github.com/decred/gominer/cl"
+	"github.com/decred/gominer/util"
+	"github.com/decred/gominer/work"
+)
+
+// Return the GPU library in use.
+func gpuLib() string {
+	return "OpenCL ADL"
+}
+
+const (
+	outputBufferSize = cl.CL_size_t(64)
+	localWorksize    = 64
+	uint32Size       = cl.CL_size_t(unsafe.Sizeof(cl.CL_uint(0)))
+)
+
+var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
+
+func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
+	var programBuffer [1][]byte
+	var programSize [1]cl.CL_size_t
+
+	// Read each program file and place content into buffer array.
+	programHandle, err := os.Open(filename)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer programHandle.Close()
+
+	buf := bytes.NewBuffer(nil)
+	_, err = io.Copy(buf, programHandle)
+	if err != nil {
+		return nil, nil, err
+	}
+	str := string(buf.Bytes())
+	programFinal := []byte(str)
+
+	programSize[0] = cl.CL_size_t(len(programFinal))
+	programBuffer[0] = make([]byte, programSize[0])
+	for i := range programFinal {
+		programBuffer[0][i] = programFinal[i]
+	}
+
+	return programBuffer[:], programSize[:], nil
+}
+
+// clError converts the OpenCL status returned by the call named f into an
+// error; statuses outside the cl.ERROR_CODES_STRINGS table are reported as unknown.
+func clError(status cl.CL_int, f string) error {
+	if -status < 0 || int(-status) >= len(cl.ERROR_CODES_STRINGS) {
+		return fmt.Errorf("%s returned unknown error (%d)", f, status)
+	}
+	return fmt.Errorf("%s returned error %s (%d)", f, cl.ERROR_CODES_STRINGS[-status], status)
+}
+
+type Device struct {
+	// The following variables must only be used atomically.
+	fanPercent  uint32
+	temperature uint32
+
+	sync.Mutex
+	index int
+	cuda  bool
+
+	// Items for OpenCL device
+	platformID    cl.CL_platform_id
+	deviceID      cl.CL_device_id
+	deviceName    string
+	context       cl.CL_context
+	queue         cl.CL_command_queue
+	outputBuffer  cl.CL_mem
+	program       cl.CL_program
+	kernel        cl.CL_kernel
+	fanTempActive bool
+	kind          string
+
+	//cuInput        cu.DevicePtr
+	cuInSize       int64
+	cuOutputBuffer []float64
+
+	workSize uint32
+
+	// extraNonce is the device extraNonce, where the first
+	// byte is the device ID (supporting up to 255 devices)
+	// while the last 3 bytes is the extraNonce value. If
+	// the extraNonce goes through all 0x??FFFFFF values,
+	// it will reset to 0x??000000.
+	extraNonce    uint32
+	currentWorkID uint32
+
+	midstate  [8]uint32
+	lastBlock [16]uint32
+
+	work     work.Work
+	newWork  chan *work.Work
+	workDone chan []byte
+	hasWork  bool
+
+	started          uint32
+	allDiffOneShares uint64
+	validShares      uint64
+	invalidShares    uint64
+
+	quit chan struct{}
+}
+
+func deviceStats(index int) (uint32, uint32) {
+	fanPercent := uint32(0)
+	temperature := uint32(0)
+	tempDivisor := uint32(1000)
+
+	fanPercent = adl.DeviceFanPercent(index)
+	temperature = adl.DeviceTemperature(index) / tempDivisor
+
+	return fanPercent, temperature
+}
+
+func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
+	var platformID cl.CL_platform_id
+	platformIDs, err := getCLPlatforms()
+	if err != nil {
+		return platformID, nil, fmt.Errorf("Could not get CL platforms: %v", err)
+	}
+	platformID = platformIDs[0]
+	CLdeviceIDs, err := getCLDevices(platformID)
+	if err != nil {
+		return platformID, nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
+	}
+	return platformID, CLdeviceIDs, nil
+}
+
+func getCLPlatforms() ([]cl.CL_platform_id, error) {
+	var numPlatforms cl.CL_uint
+	status := cl.CLGetPlatformIDs(0, nil, &numPlatforms)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetPlatformIDs")
+	}
+	platforms := make([]cl.CL_platform_id, numPlatforms)
+	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetPlatformIDs")
+	}
+	return platforms, nil
+}
+
+// getCLDevices returns the list of devices for the given platform.
+func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
+	var numDevices cl.CL_uint
+	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, 0, nil,
+		&numDevices)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetDeviceIDs")
+	}
+	devices := make([]cl.CL_device_id, numDevices)
+	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices,
+		devices, nil)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLGetDeviceIDs")
+	}
+	return devices, nil
+}
+
+// ListDevices prints a list of devices present.
+func ListDevices() {
+	platformIDs, err := getCLPlatforms()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Could not get CL platforms: %v\n", err)
+		os.Exit(1)
+	}
+
+	deviceListIndex := 0
+	for i := range platformIDs {
+		platformID := platformIDs[i]
+		deviceIDs, err := getCLDevices(platformID)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Could not get CL devices for platform: %v\n", err)
+			os.Exit(1)
+		}
+		for _, deviceID := range deviceIDs {
+			fmt.Printf("DEV #%d: %s\n", deviceListIndex, getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"))
+			deviceListIndex++
+		}
+
+	}
+}
+
+func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.CL_device_id,
+	workDone chan []byte) (*Device, error) {
+	d := &Device{
+		index:       index,
+		platformID:  platformID,
+		deviceID:    deviceID,
+		deviceName:  getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
+		kind:        "adl",
+		quit:        make(chan struct{}),
+		newWork:     make(chan *work.Work, 5),
+		workDone:    workDone,
+		fanPercent:  0,
+		temperature: 0,
+	}
+
+	var status cl.CL_int
+
+	// Create the CL context.
+	d.context = cl.CLCreateContext(nil, 1, []cl.CL_device_id{deviceID},
+		nil, nil, &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateContext")
+	}
+
+	// Create the command queue.
+	d.queue = cl.CLCreateCommandQueue(d.context, deviceID, 0, &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateCommandQueue")
+	}
+
+	// Create the output buffer.
+	d.outputBuffer = cl.CLCreateBuffer(d.context, cl.CL_MEM_READ_WRITE,
+		uint32Size*outputBufferSize, nil, &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateBuffer")
+	}
+
+	// Load kernel source.
+	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
+	if err != nil {
+		return nil, fmt.Errorf("Could not load kernel source: %v", err)
+	}
+
+	// Create the program.
+	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc[:],
+		progSize[:], &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateProgramWithSource")
+	}
+
+	// Build the program for the device.
+	compilerOptions := ""
+	compilerOptions += fmt.Sprintf(" -D WORKSIZE=%d", localWorksize)
+	status = cl.CLBuildProgram(d.program, 1, []cl.CL_device_id{deviceID},
+		[]byte(compilerOptions), nil, nil)
+	if status != cl.CL_SUCCESS {
+		err = clError(status, "CLBuildProgram")
+
+		// Something went wrong! Print what it is.
+		var logSize cl.CL_size_t
+		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
+			cl.CL_PROGRAM_BUILD_LOG, 0, nil, &logSize)
+		if status != cl.CL_SUCCESS {
+			minrLog.Errorf("Could not obtain compilation error log: %v",
+				clError(status, "CLGetProgramBuildInfo"))
+		}
+		var programLog interface{}
+		status = cl.CLGetProgramBuildInfo(d.program, deviceID,
+			cl.CL_PROGRAM_BUILD_LOG, logSize, &programLog, nil)
+		if status != cl.CL_SUCCESS {
+			minrLog.Errorf("Could not obtain compilation error log: %v",
+				clError(status, "CLGetProgramBuildInfo"))
+		}
+		minrLog.Errorf("%s\n", programLog)
+
+		return nil, err
+	}
+
+	// Create the kernel.
+	d.kernel = cl.CLCreateKernel(d.program, []byte("search"), &status)
+	if status != cl.CL_SUCCESS {
+		return nil, clError(status, "CLCreateKernel")
+	}
+
+	d.started = uint32(time.Now().Unix())
+
+	// Autocalibrate the desired work size for the kernel, or use one of the
+	// values passed explicitly by the user.
+	// The intensity or worksize must be set by the user.
+	userSetWorkSize := false
+	if len(cfg.IntensityInts) > 0 || len(cfg.WorkSizeInts) > 0 {
+		userSetWorkSize = true
+	}
+
+	var globalWorkSize uint32
+	if !userSetWorkSize {
+		// Apply the first setting as a global setting
+		calibrateTime := cfg.AutocalibrateInts[0]
+
+		// Override with the per-device setting if it exists
+		for i := range cfg.AutocalibrateInts {
+			if i == order {
+				calibrateTime = cfg.AutocalibrateInts[i]
+			}
+		}
+
+		idealWorkSize, err := d.calcWorkSizeForMilliseconds(calibrateTime)
+		if err != nil {
+			return nil, err
+		}
+
+		minrLog.Debugf("Autocalibration successful, work size for %v"+
+			"ms per kernel execution on device %v determined to be %v",
+			calibrateTime, d.index, idealWorkSize)
+
+		globalWorkSize = idealWorkSize
+	} else {
+		if len(cfg.IntensityInts) > 0 {
+			// Apply the first setting as a global setting
+			globalWorkSize = 1 << uint32(cfg.IntensityInts[0])
+
+			// Override with the per-device setting if it exists
+			for i := range cfg.IntensityInts {
+				if i == order {
+					globalWorkSize = 1 << uint32(cfg.IntensityInts[order])
+				}
+			}
+		}
+		if len(cfg.WorkSizeInts) > 0 {
+			// Apply the first setting as a global setting
+			globalWorkSize = uint32(cfg.WorkSizeInts[0])
+
+			// Override with the per-device setting if it exists
+			for i := range cfg.WorkSizeInts {
+				if i == order {
+					globalWorkSize = uint32(cfg.WorkSizeInts[order])
+				}
+			}
+
+		}
+	}
+	intensity := math.Log2(float64(globalWorkSize))
+	minrLog.Infof("DEV #%d: Work size set to %v ('intensity' %v)",
+		d.index, globalWorkSize, intensity)
+	d.workSize = globalWorkSize
+
+	fanPercent, temperature := deviceStats(d.index)
+	// Newer cards will idle with the fan off so just check if we got
+	// a good temperature reading
+	if temperature != 0 {
+		atomic.StoreUint32(&d.fanPercent, fanPercent)
+		atomic.StoreUint32(&d.temperature, temperature)
+		d.fanTempActive = true
+	}
+
+	return d, nil
+}
+
+// runDevice is the device's mining loop: it loads the current work into the
+// kernel arguments, runs the kernel and reports candidates until d.quit fires.
+func (d *Device) runDevice() error {
+	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
+	outputData := make([]uint32, outputBufferSize)
+
+	// Bump the extraNonce for the device it's running on
+	// when you begin mining. This ensures each device is doing
+	// different work. If the extraNonce has already been
+	// set for valid work, restore that.
+	d.extraNonce += uint32(d.index) << 24
+	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+
+	var status cl.CL_int
+	for {
+		d.updateCurrentWork()
+
+		select {
+		case <-d.quit:
+			return nil
+		default:
+		}
+
+		// Increment extraNonce.
+		util.RolloverExtraNonce(&d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+
+		// Update the timestamp. Only solo work allows you to roll
+		// the timestamp.
+		ts := d.work.JobTime
+		if d.work.IsGetWork {
+			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+			ts = d.work.JobTime + diffSeconds
+		}
+		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
+
+		// arg 0: pointer to the buffer
+		obuf := d.outputBuffer
+		status = cl.CLSetKernelArg(d.kernel, 0,
+			cl.CL_size_t(unsafe.Sizeof(obuf)),
+			unsafe.Pointer(&obuf))
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLSetKernelArg")
+		}
+
+		// args 1..8: midstate
+		for i := 0; i < 8; i++ {
+			ms := d.midstate[i]
+			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+1),
+				uint32Size, unsafe.Pointer(&ms))
+			if status != cl.CL_SUCCESS {
+				return clError(status, "CLSetKernelArg")
+			}
+		}
+
+		// args 9..20: lastBlock except nonce
+		i2 := 0
+		for i := 0; i < 12; i++ {
+			if i2 == work.Nonce0Word {
+				i2++
+			}
+			lb := d.lastBlock[i2]
+			status = cl.CLSetKernelArg(d.kernel, cl.CL_uint(i+9),
+				uint32Size, unsafe.Pointer(&lb))
+			if status != cl.CL_SUCCESS {
+				return clError(status, "CLSetKernelArg")
+			}
+			i2++
+		}
+
+		// Clear the found count from the buffer
+		status = cl.CLEnqueueWriteBuffer(d.queue, d.outputBuffer,
+			cl.CL_FALSE, 0, uint32Size, unsafe.Pointer(&zeroSlice[0]),
+			0, nil, nil)
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLEnqueueWriteBuffer")
+		}
+
+		// Execute the kernel and follow its execution time.
+		currentTime := time.Now()
+		globalWorkSize := [1]cl.CL_size_t{cl.CL_size_t(d.workSize)}
+		localWorkSize := [1]cl.CL_size_t{localWorksize}
+		status = cl.CLEnqueueNDRangeKernel(d.queue, d.kernel, 1, nil,
+			globalWorkSize[:], localWorkSize[:], 0, nil, nil)
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLEnqueueNDRangeKernel")
+		}
+
+		// Read the output buffer, keeping the status so the check below sees it.
+		status = cl.CLEnqueueReadBuffer(d.queue, d.outputBuffer, cl.CL_TRUE, 0,
+			uint32Size*outputBufferSize, unsafe.Pointer(&outputData[0]), 0,
+			nil, nil)
+		if status != cl.CL_SUCCESS {
+			return clError(status, "CLEnqueueReadBuffer")
+		}
+
+		for i := uint32(0); i < outputData[0]; i++ {
+			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
+				"extraNonce %08x, workID %08x, timestamp %08x",
+				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
+				util.Uint32EndiannessSwap(d.currentWorkID),
+				d.lastBlock[work.TimestampWord])
+
+			// Assess the work. If it's below target, it'll be rejected
+			// here. The mining algorithm currently sends this function any
+			// difficulty 1 shares.
+			d.foundCandidate(d.lastBlock[work.TimestampWord], outputData[i+1],
+				d.lastBlock[work.Nonce1Word])
+		}
+
+		elapsedTime := time.Since(currentTime)
+		minrLog.Tracef("DEV #%d: Kernel execution to read time: %v", d.index,
+			elapsedTime)
+	}
+}
+
+// newMinerDevs creates a Device for each permitted OpenCL device and appends it to m.devices.
+func newMinerDevs(m *Miner) (*Miner, int, error) {
+	deviceListIndex := 0
+	deviceListEnabledCount := 0
+
+	platformIDs, err := getCLPlatforms()
+	if err != nil {
+		return nil, 0, fmt.Errorf("Could not get CL platforms: %v", err)
+	}
+
+	for p := range platformIDs {
+		platformID := platformIDs[p]
+		CLdeviceIDs, err := getCLDevices(platformID)
+		if err != nil {
+			return nil, 0, fmt.Errorf("Could not get CL devices for platform: %v", err)
+		}
+
+		for _, CLdeviceID := range CLdeviceIDs {
+			miningAllowed := false
+
+			// Enforce device restrictions if they exist
+			if len(cfg.DeviceIDs) > 0 {
+				for _, i := range cfg.DeviceIDs {
+					if deviceListIndex == i {
+						miningAllowed = true
+					}
+				}
+			} else {
+				miningAllowed = true
+			}
+			if miningAllowed {
+				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
+				if err != nil {
+					return nil, 0, err
+				}
+				deviceListEnabledCount++
+				m.devices = append(m.devices, newDevice)
+			}
+			deviceListIndex++
+		}
+	}
+	return m, deviceListEnabledCount, nil
+}
+
+func getDeviceInfo(id cl.CL_device_id,
+	name cl.CL_device_info,
+	str string) string {
+
+	var errNum cl.CL_int
+	var paramValueSize cl.CL_size_t
+
+	errNum = cl.CLGetDeviceInfo(id, name, 0, nil, &paramValueSize)
+
+	if errNum != cl.CL_SUCCESS {
+		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
+	}
+
+	var info interface{}
+	errNum = cl.CLGetDeviceInfo(id, name, paramValueSize, &info, nil)
+	if errNum != cl.CL_SUCCESS {
+		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
+	}
+
+	strinfo := fmt.Sprintf("%v", info)
+
+	return strinfo
+}
+
+func (d *Device) Release() {
+	cl.CLReleaseKernel(d.kernel)
+	cl.CLReleaseProgram(d.program)
+	cl.CLReleaseCommandQueue(d.queue)
+	cl.CLReleaseMemObject(d.outputBuffer)
+	cl.CLReleaseContext(d.context)
+}
diff --git a/cldevice.go b/cldevice.go
index f457e46..9209aa0 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -1,6 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
-// +build opencl,!cuda
+// +build opencl,!cuda,!opencladl
 
 package main
 
@@ -147,7 +147,7 @@ func determineDeviceKind(index int, deviceName string) string {
 	return deviceKind
 }
 
-func deviceInfo(index int) (uint32, uint32) {
+func deviceStats(index int) (uint32, uint32) {
 	basePath := "/sys/class/drm/card_I_/device/hwmon/"
 	basePath = strings.Replace(basePath, "_I_", strconv.Itoa(index), 1)
 	fanPercent := uint32(0)
@@ -177,15 +177,15 @@ func deviceInfo(index int) (uint32, uint32) {
 	pwmMax := uint32(255) // could read this from pwm1_max but it seems to be a constant
 	tempDivisor := uint32(1000)
 
-	fanPercent = deviceInfoReadSysfsEntry(hwmonPath + "pwm1")
+	fanPercent = deviceStatsReadSysfsEntry(hwmonPath + "pwm1")
 	fanPercentFloat := float64(fanPercent) / float64(pwmMax) * float64(100)
 	fanPercent = uint32(fanPercentFloat)
-	temperature = deviceInfoReadSysfsEntry(hwmonPath+"temp1_input") / tempDivisor
+	temperature = deviceStatsReadSysfsEntry(hwmonPath+"temp1_input") / tempDivisor
 
 	return fanPercent, temperature
 }
 
-func deviceInfoReadSysfsEntry(path string) uint32 {
+func deviceStatsReadSysfsEntry(path string) uint32 {
 	res := uint32(0)
 	dataRaw := ""
 
@@ -436,7 +436,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 
 	switch d.kind {
 	case "amdgpu":
-		fanPercent, temperature := deviceInfo(d.index)
+		fanPercent, temperature := deviceStats(d.index)
 		// Newer cards will idle with the fan off so just check if we got
 		// a good temperature reading
 		if temperature != 0 {
diff --git a/cudevice.go b/cudevice.go
index f9399a7..0ec7cbb 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -36,7 +36,7 @@ const (
 
 // Return the GPU library in use.
 func gpuLib() string {
-	return "Cuda"
+	return "CUDA"
 }
 
 const (
@@ -102,7 +102,7 @@ func decredHashNonce(gridx, blockx, threads uint32, startNonce uint32, nonceResu
 		C.uint32_t(startNonce), (*C.uint32_t)(unsafe.Pointer(nonceResults)), C.uint32_t(targetHigh))
 }
 
-func deviceInfo(index int) (uint32, uint32) {
+func deviceStats(index int) (uint32, uint32) {
 	fanPercent := uint32(0)
 	temperature := uint32(0)
 
@@ -211,7 +211,7 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 
 	d.cuInSize = 21
 
-	fanPercent, temperature := deviceInfo(d.index)
+	fanPercent, temperature := deviceStats(d.index)
 	// Newer cards will idle with the fan off so just check if we got
 	// a good temperature reading
 	if temperature != 0 {
diff --git a/device.go b/device.go
index b54cbae..4d97108 100644
--- a/device.go
+++ b/device.go
@@ -197,8 +197,8 @@ func (d *Device) UpdateFanTemp() {
 		// but could be split up later.  Anything else (Intel) just
 		// don't do anything.
 		switch d.kind {
-		case "amdgpu", "nvidia":
-			fanPercent, temperature := deviceInfo(d.index)
+		case "adl", "amdgpu", "nvidia":
+			fanPercent, temperature := deviceStats(d.index)
 			atomic.StoreUint32(&d.fanPercent, fanPercent)
 			atomic.StoreUint32(&d.temperature, temperature)
 			break

From f8e059e0e842c3024426e83189df4cdd937c6a95 Mon Sep 17 00:00:00 2001
From: Marco Peereboom <marco@peereboom.us>
Date: Wed, 21 Sep 2016 12:14:57 -0500
Subject: [PATCH 064/150] Implement CUDA on windows, Fixes #108

This also cleans up cgo usage across the entire project.
---
 GNUmakefile           | 55 ++++++++++++++++++++++++++++++---------
 README.windows        | 55 +++++++++++++++++++++++++++++++++++++++
 cgo_flags.go          | 11 ++++++++
 cl/buffer.go          | 13 +---------
 cl/cgo_flags.go       |  8 ++++++
 cl/cl.go              | 13 +---------
 cl/cl.h               | 14 ++++++++++
 cl/context.go         | 60 ++++++++++++++++++++++++-------------------
 cl/device.go          | 13 +---------
 cl/event.go           | 33 ++++++++++--------------
 cl/event11.go         | 14 +---------
 cl/image.go           | 13 +---------
 cl/image11.go         | 13 +---------
 cl/kernel.go          | 15 +----------
 cl/kernel1x.go        | 14 +---------
 cl/memory.go          | 24 +++++++----------
 cl/platform.go        | 13 +---------
 cl/program.go         | 35 ++++++++++---------------
 cl/program11.go       | 13 +---------
 cl/queue.go           | 13 +---------
 cl/queue1x.go         | 14 +---------
 cl/sampler.go         | 13 +---------
 cl/sampler1x.go       | 14 +---------
 cudakernel_static.go  | 26 +++++++++++++++++++
 cudakernel_windows.go | 27 +++++++++++++++++++
 cudevice.go           |  6 ++---
 decred.cu             | 14 +++++++---
 decred.h              | 18 +++++++++++++
 nvml/cgo_flags.go     |  7 +++++
 nvml/nvml.go          |  9 ++++---
 30 files changed, 311 insertions(+), 279 deletions(-)
 create mode 100644 README.windows
 create mode 100644 cgo_flags.go
 create mode 100644 cl/cgo_flags.go
 create mode 100644 cl/cl.h
 create mode 100644 cudakernel_static.go
 create mode 100644 cudakernel_windows.go
 create mode 100644 decred.h
 create mode 100644 nvml/cgo_flags.go

diff --git a/GNUmakefile b/GNUmakefile
index d161297..2224e12 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -2,27 +2,58 @@ CC ?= gcc
 CXX ?= g++
 NVCC ?= nvcc
 AR ?= ar
+# -o is gnu only so this needs to be smarter; it does work because on darwin it
+#  fails which is also not windows.
+ARCH:=$(shell uname -o)
 
 .DEFAULT_GOAL := build
 
+ifeq ($(ARCH),Msys)
+nvidia:
+endif
+
+# Windows needs additional setup and, since cgo does not support spaces in
+# include and library paths, we copy the files to a space-free location.
+#
+# Windows build assumes that CUDA V7.0 is installed in its default location.
+#
+# Windows gominer requires nvml.dll and decred.dll to reside in the same
+# directory as gominer.exe.
+ifeq ($(ARCH),Msys)
+obj: nvidia
+	mkdir nvidia
+	cp -r /c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/* nvidia
+	cp -r /c/Program\ Files/NVIDIA\ Corporation/NVSMI nvidia
+else
 obj:
+endif
 	mkdir obj
 
-obj/blake.o: obj
-	$(CC) -c sph/blake.c -o obj/blake.o
-
-obj/decred.o: obj
-	$(NVCC) -I. -c decred.cu -o obj/decred.o
-
-obj/cuda.a: obj/blake.o obj/decred.o
-	$(AR) rvs obj/cuda.a obj/blake.o obj/decred.o
-
-build: obj/cuda.a
+ifeq ($(ARCH),Msys)
+obj/decred.dll: obj sph/blake.c decred.cu
+	$(NVCC) --shared --optimize=3 --compiler-options=-GS-,-MD -I. -Isph decred.cu sph/blake.c -o obj/decred.dll
+else
+obj/decred.a: obj sph/blake.c decred.cu
+	$(NVCC) --lib --optimize=3 -I. decred.cu sph/blake.c -o obj/decred.a
+endif
+
+ifeq ($(ARCH),Msys)
+build: obj/decred.dll
+else
+build: obj/decred.a
+endif
 	go build -tags 'cuda'
 
-install: obj/cuda.a
-	go install
+ifeq ($(ARCH),Msys)
+install: obj/decred.dll
+else
+install: obj/decred.a
+endif
+	go install -tags 'cuda'
 
 clean:
 	rm -rf obj
 	go clean
+ifeq ($(ARCH),Msys)
+	rm -rf nvidia
+endif
diff --git a/README.windows b/README.windows
new file mode 100644
index 0000000..5911d2d
--- /dev/null
+++ b/README.windows
@@ -0,0 +1,55 @@
+Windows support for gominer requires very specific build steps.  Unlike Linux,
+Windows requires dynamically linked libraries and some additional hoops to jump
+through.  Great pains went into making the build as simple as possible, but
+unfortunately it is still complex and requires specific versions.  Effort went
+into using less painful toolchains but those all ended in failure.  It is
+advisable to install the toolchain on a fresh machine; leftovers are going to
+make stuff not work right.  Please accept the dark magic used here as gospel
+for any other way is a sin.
+
+========================================================================
+MAKE SURE THERE ARE NO OTHER GCC/BINUTILS IN THE PATH!
+MAKE SURE THERE ARE NO LEFTOVER INCLUDES AND LIBRARIES!
+ONLY USE MINGW64!
+YOU HAVE BEEN WARNED!
+========================================================================
+
+Note that the build system copies various directories into the relative path in
+order to work around issues in cgo/windows.
+
+Compiling gominer on windows requires installing the following, in this order:
+* Microsoft Visual Studio 2013 (cl.exe, used by nvcc)
+* NVIDIA CUDA V7.0 drivers (includes nvml.dll) and GPU toolkit (nvcc, headers, libs etc)
+* Official Go 1.7.1 installed in the default location and with GOPATH set
+* Git-Bash (for the shell and to check out code)
+* MingW64 (only mingw64 with defaults works!) http://sourceforge.net/projects/mingw-w64/
+
+* Setup environment (in advanced windows settings!):
+* ensure that cl.exe, go.exe and git.exe are in PATH
+* ensure that GOPATH is set somewhere sane (e.g. %HOMEPATH%\go)
+
+Install mumax/3 (in cmd.exe):
+$ go get github.com/mumax/3
+
+Install glide (in git-bash):
+$ go get github.com/Masterminds/glide
+
+Download gominer source (in git-bash; will fail compilation but that is ok):
+$ go get github.com/decred/gominer
+
+Setup vendoring for gominer
+$ cd $GOPATH/src/github.com/decred/gominer
+$ glide i
+
+Building CUDA (in git-bash):
+$ cd $GOPATH/src/github.com/decred/gominer
+$ mingw32-make.exe
+
+Distribution requires 3 files:
+* gominer.exe
+* decred.dll (relative path -> obj/decred.dll)
+* nvml.dll (relative path -> nvidia/NVSMI/nvml.dll)
+
+NOTE: CUDA does not work over remote desktop.  If one wishes to remotely manage
+a windows machine that uses CUDA one MUST use VNC instead.  This is a CUDA
+issue, not a gominer issue.
diff --git a/cgo_flags.go b/cgo_flags.go
new file mode 100644
index 0000000..dc226d2
--- /dev/null
+++ b/cgo_flags.go
@@ -0,0 +1,11 @@
+// Copyright (c) 2016 The Decred developers.
+
+// +build cuda,!opencl
+
+package main
+
+/*
+#cgo !windows LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/decred.a
+#cgo windows LDFLAGS: -Lobj -ldecred -Lnvidia/CUDA/v7.0/lib/x64 -lcuda -lcudart -Lnvidia/NVSMI -lnvml
+*/
+import "C"
diff --git a/cl/buffer.go b/cl/buffer.go
index eda27e3..d318fa4 100644
--- a/cl/buffer.go
+++ b/cl/buffer.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/cgo_flags.go b/cl/cgo_flags.go
new file mode 100644
index 0000000..a5363dd
--- /dev/null
+++ b/cl/cgo_flags.go
@@ -0,0 +1,8 @@
+package cl
+
+/*
+#cgo CFLAGS: -I CL
+#cgo !darwin LDFLAGS: -lOpenCL
+#cgo darwin LDFLAGS: -framework OpenCL
+*/
+import "C"
diff --git a/cl/cl.go b/cl/cl.go
index 5d83127..2cb0e28 100644
--- a/cl/cl.go
+++ b/cl/cl.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/cl.h b/cl/cl.h
new file mode 100644
index 0000000..e2d8943
--- /dev/null
+++ b/cl/cl.h
@@ -0,0 +1,14 @@
+#ifndef CL_H
+#define CL_H
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+
+#ifdef __APPLE__
+#include "OpenCL/opencl.h"
+#else
+#include "CL/opencl.h"
+#endif
+
+#endif /* CL_H */
diff --git a/cl/context.go b/cl/context.go
index 09f0e37..00bbbac 100644
--- a/cl/context.go
+++ b/cl/context.go
@@ -1,42 +1,48 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
+#include "cl.h"
 
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+// this needs to come out
+typedef void*	pVoid;
+extern void	go_ctx_notify(char *errinfo, void *private_info, int cb,
+		    void *user_data);
 
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
-
-extern void go_ctx_notify(char *errinfo, void *private_info, int cb, void *user_data);
-static void CL_CALLBACK c_ctx_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data) {
+static void CL_CALLBACK	c_ctx_notify(const char *errinfo,
+			    const void *private_info, size_t cb,
+			    void *user_data)
+{
 	go_ctx_notify((char *)errinfo, (void *)private_info, cb, user_data);
 }
 
-typedef void* pVoid;
-static pVoid* allocArray(size_t n) { return (pVoid*)malloc(n * sizeof(pVoid)); }
-static void   freeArray (pVoid* p) { free(p); }
+static pVoid
+*allocArray(size_t n)
+{
+	return (pVoid*)malloc(n * sizeof(pVoid));
+}
 
-static cl_context CLCreateContext(	const cl_context_properties *  	properties,
-					                cl_uint                  		num_devices,
-					                const cl_device_id *     		devices,
-					                void *                   		user_data,
-					                cl_int *                 		errcode_ret){
-	return clCreateContext(properties, num_devices, devices, c_ctx_notify, user_data, errcode_ret);
+static void
+freeArray(pVoid* p)
+{
+	free(p);
 }
 
-static cl_context CLCreateContextFromType(	const cl_context_properties *  	properties,
-					                		cl_device_type     				device_type,
-					                		void *                   		user_data,
-					                		cl_int *                 		errcode_ret){
-    return clCreateContextFromType(properties, device_type, c_ctx_notify, user_data, errcode_ret);
+static cl_context
+CLCreateContext(const cl_context_properties *properties, cl_uint num_devices,
+    const cl_device_id *devices, void *user_data, cl_int *errcode_ret)
+{
+	return clCreateContext(properties, num_devices, devices, c_ctx_notify,
+	    user_data, errcode_ret);
 }
+
+static cl_context
+CLCreateContextFromType(const cl_context_properties *properties,
+    cl_device_type device_type, void *user_data, cl_int *errcode_ret)
+{
+	return clCreateContextFromType(properties, device_type, c_ctx_notify,
+	    user_data, errcode_ret);
+}
+
 */
 import "C"
 import "unsafe"
diff --git a/cl/device.go b/cl/device.go
index 3558406..d966d63 100644
--- a/cl/device.go
+++ b/cl/device.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 import "unsafe"
diff --git a/cl/event.go b/cl/event.go
index ec55b4f..fb75971 100644
--- a/cl/event.go
+++ b/cl/event.go
@@ -1,28 +1,23 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
-
-extern void go_evt_notify(cl_event event, cl_int event_command_exec_status, void *user_data);
-static void CL_CALLBACK c_evt_notify(cl_event event, cl_int event_command_exec_status, void *user_data) {
+#include "cl.h"
+
+// this needs to come out
+extern void	go_evt_notify(cl_event event, cl_int event_command_exec_status,
+		    void *user_data);
+static void CL_CALLBACK
+c_evt_notify(cl_event event, cl_int event_command_exec_status, void *user_data)
+{
 	go_evt_notify(event, event_command_exec_status, user_data);
 }
 
-static cl_int CLSetEventCallback(	cl_event event,
-									cl_int command_exec_callback_type,
-									void *user_data){
-    return clSetEventCallback(event, command_exec_callback_type, c_evt_notify, user_data);
+static cl_int
+CLSetEventCallback(cl_event event, cl_int command_exec_callback_type,
+    void *user_data)
+{
+    return clSetEventCallback(event, command_exec_callback_type, c_evt_notify,
+        user_data);
 }
 */
 import "C"
diff --git a/cl/event11.go b/cl/event11.go
index 5f4fb2d..ef81cda 100644
--- a/cl/event11.go
+++ b/cl/event11.go
@@ -1,19 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
-
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/image.go b/cl/image.go
index 73565ae..a65b66f 100644
--- a/cl/image.go
+++ b/cl/image.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/image11.go b/cl/image11.go
index 8133ae9..cac7e55 100644
--- a/cl/image11.go
+++ b/cl/image11.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 import "unsafe"
diff --git a/cl/kernel.go b/cl/kernel.go
index 73b4ec9..08719b5 100644
--- a/cl/kernel.go
+++ b/cl/kernel.go
@@ -1,20 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
-#include <string.h>
-#include <stdlib.h>
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/kernel1x.go b/cl/kernel1x.go
index 864a9b1..0da8d20 100644
--- a/cl/kernel1x.go
+++ b/cl/kernel1x.go
@@ -1,19 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/memory.go b/cl/memory.go
index 1c47c4b..5a1c6cc 100644
--- a/cl/memory.go
+++ b/cl/memory.go
@@ -1,24 +1,18 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
-extern void go_mem_notify(cl_mem memobj, void *user_data);
-static void CL_CALLBACK c_mem_notify(cl_mem memobj, void *user_data) {
+#include "cl.h"
+extern void	go_mem_notify(cl_mem memobj, void *user_data);
+
+static void CL_CALLBACK
+c_mem_notify(cl_mem memobj, void *user_data)
+{
 	go_mem_notify(memobj, user_data);
 }
 
-static cl_int CLSetMemObjectDestructorCallback(cl_mem memobj, void *user_data){
+static cl_int
+CLSetMemObjectDestructorCallback(cl_mem memobj, void *user_data)
+{
     return clSetMemObjectDestructorCallback(memobj, c_mem_notify, user_data);
 }
 */
diff --git a/cl/platform.go b/cl/platform.go
index cdc90c0..e6c2679 100644
--- a/cl/platform.go
+++ b/cl/platform.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 import "unsafe"
diff --git a/cl/program.go b/cl/program.go
index b36a287..ee68491 100644
--- a/cl/program.go
+++ b/cl/program.go
@@ -1,33 +1,26 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
+
+// this needs to come out
 #include <string.h>
 #include <stdlib.h>
 
-extern void go_prg_notify(cl_program program, void *user_data);
-static void CL_CALLBACK c_prg_build_notify(cl_program program, void *user_data) {
+extern void	go_prg_notify(cl_program program, void *user_data);
+
+static void CL_CALLBACK
+c_prg_build_notify(cl_program program, void *user_data)
+{
 	go_prg_notify(program, user_data);
 }
 
-static cl_int CLBuildProgram(cl_program program,
-							cl_uint num_devices,
-							const cl_device_id *devices,
-							const char *options,
-							void *user_data){
-
-    return clBuildProgram(program, num_devices, devices, options, c_prg_build_notify, user_data);
+static cl_int
+CLBuildProgram(cl_program program, cl_uint num_devices,
+    const cl_device_id *devices, const char *options, void *user_data)
+{
+	return clBuildProgram(program, num_devices, devices, options,
+	    c_prg_build_notify, user_data);
 }
 */
 import "C"
diff --git a/cl/program11.go b/cl/program11.go
index 40a1c15..4463f00 100644
--- a/cl/program11.go
+++ b/cl/program11.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/queue.go b/cl/queue.go
index e051a52..074663f 100644
--- a/cl/queue.go
+++ b/cl/queue.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/queue1x.go b/cl/queue1x.go
index c4240c8..1ad602b 100644
--- a/cl/queue1x.go
+++ b/cl/queue1x.go
@@ -1,19 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/sampler.go b/cl/sampler.go
index 16f5780..3f5e0c3 100644
--- a/cl/sampler.go
+++ b/cl/sampler.go
@@ -1,18 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cl/sampler1x.go b/cl/sampler1x.go
index ef6870e..f79baa2 100644
--- a/cl/sampler1x.go
+++ b/cl/sampler1x.go
@@ -1,19 +1,7 @@
 package cl
 
 /*
-#cgo CFLAGS: -I CL
-#cgo !darwin LDFLAGS: -lOpenCL
-#cgo darwin LDFLAGS: -framework OpenCL
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
-
-#ifdef __APPLE__
-#include "OpenCL/opencl.h"
-#else
-#include "CL/opencl.h"
-#endif
+#include "cl.h"
 */
 import "C"
 
diff --git a/cudakernel_static.go b/cudakernel_static.go
new file mode 100644
index 0000000..d574f75
--- /dev/null
+++ b/cudakernel_static.go
@@ -0,0 +1,26 @@
+// Copyright (c) 2016 The Decred developers.
+
+//+build linux,cuda darwin,cuda
+
+package main
+
+/*
+#include "decred.h"
+*/
+import "C"
+import (
+	"github.com/mumax/3/cuda/cu"
+	"unsafe"
+)
+
+func cudaPrecomputeTable(input *[192]byte) {
+	if input == nil {
+		panic("input is nil")
+	}
+	C.decred_cpu_setBlock_52((*C.uint32_t)(unsafe.Pointer(input)))
+}
+
+func cudaInvokeKernel(gridx, blockx, threads uint32, startNonce uint32, nonceResults cu.DevicePtr, targetHigh uint32) {
+	C.decred_hash_nonce(C.uint32_t(gridx), C.uint32_t(blockx), C.uint32_t(threads),
+		C.uint32_t(startNonce), (*C.uint32_t)(unsafe.Pointer(nonceResults)), C.uint32_t(targetHigh))
+}
diff --git a/cudakernel_windows.go b/cudakernel_windows.go
new file mode 100644
index 0000000..a8cadab
--- /dev/null
+++ b/cudakernel_windows.go
@@ -0,0 +1,27 @@
+// Copyright (c) 2016 The Decred developers.
+// +build cuda
+
+package main
+
+import (
+	"syscall"
+	"unsafe"
+
+	"github.com/mumax/3/cuda/cu"
+)
+
+var (
+	//kernelDll           = syscall.MustLoadDLL("decred.dll")
+	kernelDll               = syscall.MustLoadDLL("decred.dll")
+	precomputeTableProcAddr = kernelDll.MustFindProc("decred_cpu_setBlock_52").Addr()
+	kernelProcAddr          = kernelDll.MustFindProc("decred_hash_nonce").Addr()
+)
+
+func cudaPrecomputeTable(input *[192]byte) {
+	syscall.Syscall(precomputeTableProcAddr, 1, uintptr(unsafe.Pointer(input)), 0, 0)
+}
+
+func cudaInvokeKernel(gridx, blockx, threads uint32, startNonce uint32, nonceResults cu.DevicePtr, targetHigh uint32) {
+	syscall.Syscall6(kernelProcAddr, 6, uintptr(gridx), uintptr(blockx), uintptr(threads),
+		uintptr(startNonce), uintptr(nonceResults), uintptr(targetHigh))
+}
diff --git a/cudevice.go b/cudevice.go
index 0ec7cbb..a7d15e7 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -5,12 +5,10 @@
 package main
 
 /*
-#cgo LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/cuda.a
-#include <stdint.h>
-void decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads, uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh);
-void decred_cpu_setBlock_52(const uint32_t *input);
+#include "decred.h"
 */
 import "C"
+
 import (
 	"encoding/binary"
 	"fmt"
diff --git a/decred.cu b/decred.cu
index e37af60..f755a1d 100644
--- a/decred.cu
+++ b/decred.cu
@@ -14,6 +14,12 @@
 #include <memory.h>
 #include <miner.h>
 
+#if defined(_WIN32)
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif /* _WIN32 */
+
 extern "C" {
 #include <sph/sph_blake.h>
 }
@@ -179,15 +185,17 @@ __global__ void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t sta
 }
 
 extern "C" {
-void decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads, uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh)
+DLLEXPORT void
+decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads,
+    uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh)
 {
 	decred_gpu_hash_nonce <<<grid, block>>> (threads, startNonce, resNonce, targetHigh);
 }
 }
 
 extern "C" {
-__host__
-void decred_cpu_setBlock_52(const uint32_t *input)
+__host__ DLLEXPORT void
+decred_cpu_setBlock_52(const uint32_t *input)
 {
 	/*
 	for (int i = 0; i < 180/4; i++)
diff --git a/decred.h b/decred.h
new file mode 100644
index 0000000..fb7ca6e
--- /dev/null
+++ b/decred.h
@@ -0,0 +1,18 @@
+#ifndef DECRED_H
+#define DECRED_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+void	decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads,
+	    uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh);
+void	decred_cpu_setBlock_52(const uint32_t *input);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* DECRED_H */
diff --git a/nvml/cgo_flags.go b/nvml/cgo_flags.go
new file mode 100644
index 0000000..7425a2f
--- /dev/null
+++ b/nvml/cgo_flags.go
@@ -0,0 +1,7 @@
+package nvml
+
+/*
+#cgo !windows LDFLAGS: -lnvidia-ml
+#cgo windows LDFLAGS: -L../nvidia/NVSMI/ -lnvml
+*/
+import "C"
diff --git a/nvml/nvml.go b/nvml/nvml.go
index 1a7eab1..15a7a23 100755
--- a/nvml/nvml.go
+++ b/nvml/nvml.go
@@ -1,9 +1,10 @@
 package nvml
 
-// #cgo LDFLAGS: -lnvidia-ml
-// #include <stdio.h>
-// #include <stdlib.h>
-// #include <nvml.h>
+/*
+#include <stdio.h>
+#include <stdlib.h>
+#include <nvml.h>
+*/
 import "C"
 
 import (

From c236ba18ca15edb330c253c7170770ce0a425ab7 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 22 Sep 2016 15:53:07 -0400
Subject: [PATCH 065/150] Remove some unused and unneeded code

Closes #111
---
 decred.cu | 121 ------------------------------------------------------
 device.go |  34 ---------------
 miner.h   |  75 ---------------------------------
 3 files changed, 230 deletions(-)

diff --git a/decred.cu b/decred.cu
index f755a1d..19a123a 100644
--- a/decred.cu
+++ b/decred.cu
@@ -53,10 +53,6 @@ __constant__ uint32_t _ALIGN(16) c_h[2];
 __constant__ uint32_t _ALIGN(16) c_data[32];
 __constant__ uint32_t _ALIGN(16) c_xors[215];
 
-/* Buffers of candidate nonce(s) */
-static uint32_t *d_resNonce[MAX_GPUS];
-static uint32_t *h_resNonce[MAX_GPUS];
-
 #define ROR8(a)  __byte_perm(a, 0, 0x0321)
 #define ROL16(a) __byte_perm(a, 0, 0x1032)
 
@@ -361,120 +357,3 @@ decred_cpu_setBlock_52(const uint32_t *input)
 
 /* ############################################################################################################################### */
 
-static bool init[MAX_GPUS] = { 0 };
-
-// nonce position is different in decred
-#define DCR_NONCE_OFT32 35
-
-#if 0
-extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
-{
-	uint32_t _ALIGN(64) endiandata[48];
-
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	uint32_t *pnonce = &pdata[DCR_NONCE_OFT32];
-
-	const uint32_t first_nonce = *pnonce;
-	const uint32_t targetHigh = opt_benchmark ? 0x1ULL : ptarget[6];
-
-	const int dev_id = device_map[thr_id];
-	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 29 : 25;
-	if (device_sm[dev_id] < 350) intensity = 22;
-
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
-	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
-
-	const dim3 grid((throughput + TPB-1)/(TPB));
-	const dim3 block(TPB);
-
-	if (!init[thr_id]){
-		cudaSetDevice(dev_id);
-		if (opt_cudaschedule == -1 && gpu_threads == 1) {
-			cudaDeviceReset();
-			// reduce cpu usage (linux)
-			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-			CUDA_LOG_ERROR();
-		}
-
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
-		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
-		init[thr_id] = true;
-	}
-	memcpy(endiandata, pdata, 180);
-
-	decred_cpu_setBlock_52(endiandata);
-	h_resNonce[thr_id][0] = 1;
-
-	do {
-		if (h_resNonce[thr_id][0])
-			cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
-
-		// GPU HASH
-		decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh);
-		cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-		if (h_resNonce[thr_id][0])
-		{
-			cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], (h_resNonce[thr_id][0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-			for(uint32_t i=1; i <= h_resNonce[thr_id][0]; i++)
-			{
-				uint32_t _ALIGN(64) vhash[8];
-				be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][i]);
-				decred_hash(vhash, endiandata);
-				if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
-				{
-					int rc = 1;
-					work_set_target_ratio(work, vhash);
-					*hashes_done = (*pnonce) - first_nonce + throughput;
-					work->nonces[0] = swab32(h_resNonce[thr_id][i]);
-					// search for another nonce
-					for(uint32_t j=i+1; j <= h_resNonce[thr_id][0]; j++)
-					{
-						be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][j]);
-						decred_hash(vhash, endiandata);
-						if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)){
-							work->nonces[1] = swab32(h_resNonce[thr_id][j]);
-							if(!opt_quiet)
-								gpulog(LOG_NOTICE, thr_id, "second nonce found %u / %08x - %u / %08x", i, work->nonces[0], j, work->nonces[1]);
-							if(bn_hash_target_ratio(vhash, ptarget) > work->shareratio) {
-								work_set_target_ratio(work, vhash);
-								xchg(work->nonces[1], work->nonces[0]);
-							}
-							rc = 2;
-							break;
-						}
-					}
-					*pnonce = work->nonces[0];
-					return rc;
-				} else {
-					gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", i, h_resNonce[thr_id][i]);
-				}
-			}
-		}
-		*pnonce += throughput;
-
-	} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + (*pnonce));
-
-	*hashes_done = (*pnonce) - first_nonce;
-	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
-	return 0;
-}
-#endif
-
-// cleanup
-extern "C" void free_decred(int thr_id)
-{
-	if (!init[thr_id])
-		return;
-
-	cudaDeviceSynchronize();
-	cudaFreeHost(h_resNonce[thr_id]);
-	cudaFree(d_resNonce[thr_id]);
-
-	init[thr_id] = false;
-
-	cudaDeviceSynchronize();
-}
diff --git a/device.go b/device.go
index 4d97108..a269205 100644
--- a/device.go
+++ b/device.go
@@ -5,7 +5,6 @@ package main
 import (
 	"encoding/binary"
 	"encoding/hex"
-	"math/big"
 	"sync/atomic"
 	"time"
 
@@ -74,39 +73,6 @@ func (d *Device) Run() {
 	}
 }
 
-// testFoundCandidate has some hardcoded data to match up with sgminer.
-func (d *Device) testFoundCandidate() {
-	n1 := uint32(33554432)
-	n0 := uint32(7245027)
-
-	d.midstate[0] = uint32(2421507776)
-	d.midstate[1] = uint32(2099684366)
-	d.midstate[2] = uint32(8033620)
-	d.midstate[3] = uint32(950943511)
-	d.midstate[4] = uint32(2489053653)
-	d.midstate[5] = uint32(3357747798)
-	d.midstate[6] = uint32(2534384973)
-	d.midstate[7] = uint32(2947973092)
-
-	target, _ := hex.DecodeString("00000000ffff0000000000000000000000000000000000000000000000000000")
-	bigTarget := new(big.Int)
-	bigTarget.SetString(hex.EncodeToString(target), 16)
-	d.work.Target = bigTarget
-
-	data, _ := hex.DecodeString("01000000509a3b7c65f8986a464c0e82ec5ca6aaf18cf13787507cbfc20a000000000000a455f69725e9c8623baa3c9c5a708aefb947702dc2b620b4c10129977e104c0275571a5ca5b1308b075fe74224504c9e6b1153f3de97235e7a8c7e58ea8f1c55010086a1d41fb3ee05000000fda400004a33121a2db33e1101000000abae0000260800008ec78357000000000000000000a461f2e3014335000000000000000000000000000000000000000000000000000000000000000000000000")
-	copy(d.work.Data[:], data)
-
-	minrLog.Errorf("data: %v", d.work.Data)
-	minrLog.Errorf("target: %v", d.work.Target)
-	minrLog.Errorf("nonce1 %x, nonce0: %x", n1, n0)
-
-	// d.foundCandidate(n1, n0, ts)
-
-	//need to match
-	//00000000df6ffb6059643a9215f95751baa7b1ed8aa93edfeb9a560ecb1d5884
-	//stratum submit {"params": ["test", "76df", "0200000000a461f2e3014335", "5783c78e", "e38c6e00"], "id": 4, "method": "mining.submit"}
-}
-
 func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	d.Lock()
 	defer d.Unlock()
diff --git a/miner.h b/miner.h
index f2c75c3..4b3e1ae 100644
--- a/miner.h
+++ b/miner.h
@@ -251,13 +251,6 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len);
 
 struct work;
 
-extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-
-/* free device allocated memory per algo */
-void algo_free_all(int thr_id);
-
-extern void free_decred(int thr_id);
-
 /* api related */
 void *api_thread(void *userdata);
 void api_set_throughput(int thr_id, uint32_t throughput);
@@ -412,37 +405,6 @@ void cuda_log_lasterror(int thr_id, const char* func, int line);
 void cuda_clear_lasterror();
 #define CUDA_LOG_ERROR() cuda_log_lasterror(thr_id, __func__, __LINE__)
 
-#define CL_N    "\x1B[0m"
-#define CL_RED  "\x1B[31m"
-#define CL_GRN  "\x1B[32m"
-#define CL_YLW  "\x1B[33m"
-#define CL_BLU  "\x1B[34m"
-#define CL_MAG  "\x1B[35m"
-#define CL_CYN  "\x1B[36m"
-
-#define CL_BLK  "\x1B[22;30m" /* black */
-#define CL_RD2  "\x1B[22;31m" /* red */
-#define CL_GR2  "\x1B[22;32m" /* green */
-#define CL_YL2  "\x1B[22;33m" /* dark yellow */
-#define CL_BL2  "\x1B[22;34m" /* blue */
-#define CL_MA2  "\x1B[22;35m" /* magenta */
-#define CL_CY2  "\x1B[22;36m" /* cyan */
-#define CL_SIL  "\x1B[22;37m" /* gray */
-
-#ifdef WIN32
-#define CL_GRY  "\x1B[01;30m" /* dark gray */
-#else
-#define CL_GRY  "\x1B[90m"    /* dark gray selectable in putty */
-#endif
-#define CL_LRD  "\x1B[01;31m" /* light red */
-#define CL_LGR  "\x1B[01;32m" /* light green */
-#define CL_LYL  "\x1B[01;33m" /* tooltips */
-#define CL_LBL  "\x1B[01;34m" /* light blue */
-#define CL_LMA  "\x1B[01;35m" /* light magenta */
-#define CL_LCY  "\x1B[01;36m" /* light cyan */
-
-#define CL_WHT  "\x1B[01;37m" /* white */
-
 extern void format_hashrate(double hashrate, char *output);
 extern void applog(int prio, const char *fmt, ...);
 extern void gpulog(int prio, int thr_id, const char *fmt, ...);
@@ -651,44 +613,7 @@ void applog_hash64(void *hash);
 void applog_compare_hash(void *hash, void *hash_ref);
 
 void print_hash_tests(void);
-void blake256hash(void *output, const void *input, int8_t rounds);
-void blake2s_hash(void *output, const void *input);
-void bmw_hash(void *state, const void *input);
-void c11hash(void *output, const void *input);
 void decred_hash(void *state, const void *input);
-void deephash(void *state, const void *input);
-void luffa_hash(void *state, const void *input);
-void fresh_hash(void *state, const void *input);
-void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
-void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
-void keccak256_hash(void *state, const void *input);
-unsigned int jackpothash(void *state, const void *input);
-void groestlhash(void *state, const void *input);
-void lbry_hash(void *output, const void *input);
-void lyra2re_hash(void *state, const void *input);
-void lyra2v2_hash(void *state, const void *input);
-void myriadhash(void *state, const void *input);
-void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
-void nist5hash(void *state, const void *input);
-void pentablakehash(void *output, const void *input);
-void quarkhash(void *state, const void *input);
-void qubithash(void *state, const void *input);
-void scrypthash(void* output, const void* input);
-void scryptjane_hash(void* output, const void* input);
-void sibhash(void *output, const void *input);
-void skeincoinhash(void *output, const void *input);
-void skein2hash(void *output, const void *input);
-void s3hash(void *output, const void *input);
-void wcoinhash(void *state, const void *input);
-void whirlxHash(void *state, const void *input);
-void x11evo_hash(void *output, const void *input);
-void x11hash(void *output, const void *input);
-void x13hash(void *output, const void *input);
-void x14hash(void *output, const void *input);
-void x15hash(void *output, const void *input);
-void x17hash(void *output, const void *input);
-void zr5hash(void *output, const void *input);
-void zr5hash_pok(void *output, uint32_t *pdata);
 
 #ifdef __cplusplus
 }

From e03d7eb7d2bbd367294bb384fa02271fa92ff982 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Mon, 26 Sep 2016 11:55:49 -0400
Subject: [PATCH 066/150] Clean up logging.

Simplify INFO line.

Add share total to global stats.

Add Global stats to solo mining output.

Closes #113
---
 device.go |  9 +++------
 miner.go  | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/device.go b/device.go
index a269205..f0daace 100644
--- a/device.go
+++ b/device.go
@@ -136,21 +136,18 @@ func (d *Device) PrintStats() {
 	temperature := atomic.LoadUint32(&d.temperature)
 
 	if fanPercent != 0 || temperature != 0 {
-		minrLog.Infof("DEV #%d (%s) reporting average hash rate %v, %v/%v valid work, Fan=%v%% Temp=%vC",
+		minrLog.Infof("DEV #%d (%s) %v Fan=%v%% T=%vC",
 			d.index,
 			d.deviceName,
 			util.FormatHashRate(averageHashRate),
-			d.validShares,
-			d.validShares+d.invalidShares,
 			fanPercent,
 			temperature)
 	} else {
-		minrLog.Infof("DEV #%d (%s) reporting average hash rate %v, %v/%v valid work",
+		minrLog.Infof("DEV #%d (%s) %v",
 			d.index,
 			d.deviceName,
 			util.FormatHashRate(averageHashRate),
-			d.validShares,
-			d.validShares+d.invalidShares)
+		)
 	}
 }
 
diff --git a/miner.go b/miner.go
index 05a9daa..541dc7a 100644
--- a/miner.go
+++ b/miner.go
@@ -157,19 +157,35 @@ func (m *Miner) printStatsThread() {
 	defer t.Stop()
 
 	for {
-		if cfg.Pool != "" && !cfg.Benchmark {
-			valid := atomic.LoadUint64(&m.pool.ValidShares)
-			minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v",
-				valid,
-				atomic.LoadUint64(&m.pool.InvalidShares),
-				atomic.LoadUint64(&m.staleShares))
-
-			secondsElapsed := uint32(time.Now().Unix()) - m.started
-			if (secondsElapsed / 60) > 0 {
-				utility := float64(valid) / (float64(secondsElapsed) / float64(60))
-				minrLog.Infof("Global utility (accepted shares/min): %v", utility)
+		if !cfg.Benchmark {
+			if cfg.Pool != "" {
+				valid := atomic.LoadUint64(&m.pool.ValidShares)
+				rejected := atomic.LoadUint64(&m.pool.InvalidShares)
+				stale := atomic.LoadUint64(&m.staleShares)
+				total := valid + rejected + stale
+				minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v, Total: %v",
+					valid,
+					rejected,
+					stale,
+					total,
+				)
+				secondsElapsed := uint32(time.Now().Unix()) - m.started
+				if (secondsElapsed / 60) > 0 {
+					utility := float64(valid) / (float64(secondsElapsed) / float64(60))
+					minrLog.Infof("Global utility (accepted shares/min): %v", utility)
+				}
+			} else {
+				valid := atomic.LoadUint64(&m.validShares)
+				rejected := atomic.LoadUint64(&m.invalidShares)
+				total := valid + rejected
+				minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Total: %v",
+					valid,
+					rejected,
+					total,
+				)
 			}
 		}
+
 		for _, d := range m.devices {
 			d.UpdateFanTemp()
 			d.PrintStats()

From bb35163b138e1f6095e31f0cf9b7bb2d68d54d66 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Tue, 27 Sep 2016 16:39:08 -0500
Subject: [PATCH 067/150] add automatic fan control to maintain a target
 temperature (#115)

---
 adl/adl.c      |  58 ++++++++---
 adl/adl.go     |  18 +++-
 cladldevice.go | 154 ++++++++++++++++++++++++----
 cldevice.go    | 272 ++++++++++++++++++++++++++++++++++++++++---------
 config.go      |  82 +++++++++++++--
 cudevice.go    |  55 +++++++++-
 device.go      | 201 +++++++++++++++++++++++++++++++++++-
 glide.lock     |   4 +
 miner.go       |   3 +
 9 files changed, 746 insertions(+), 101 deletions(-)

diff --git a/adl/adl.c b/adl/adl.c
index 03624d0..65224b2 100644
--- a/adl/adl.c
+++ b/adl/adl.c
@@ -22,6 +22,8 @@ int ADL_Adapter_ID_Get(int iAdapterIndex, int *lpAdapterID);
 int ADL_Adapter_NumberOfAdapters_Get(int *lpNumAdapters);
 int ADL_Main_Control_Create(ADL_MAIN_MALLOC_CALLBACK callback, int iEnumConnectedAdapters);
 int ADL_Overdrive5_FanSpeed_Get(int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
+int ADL_Overdrive5_FanSpeed_Set(int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
+int ADL_Overdrive5_FanSpeedToDefault_Set(int iAdapaterIndex, int iThermalControllerIndex);
 int ADL_Overdrive5_Temperature_Get (int iAdapterIndex, int iThermalControllerIndex, ADLTemperature *lpTemperature);
 
 int getADLInfo(int deviceid, char field[64]);
@@ -50,19 +52,7 @@ static void __stdcall ADL_Main_Memory_Free (void **lpBuffer)
   }
 }
 
-int getADLFanPercent(int deviceid) {
-  int fanPercent = 0;
-  fanPercent = getADLInfo(deviceid, "fanPercent");
-  return fanPercent;
-}
-
-int getADLTemp(int deviceid) {
-  int temp = 0;
-  temp = getADLInfo(deviceid, "temp");
-  return temp;
-}
-
-int getADLInfo(int deviceid, char field[64]) {
+int doADLCommand(int deviceid, char field[64], int arg) {
   int result, i, j, devices = 0, last_adapter = -1, gpu = 0, dummy = 0;
   int iNumberAdapters;
   struct gpu_adapters adapters[MAX_GPUDEVICES], vadapters[MAX_GPUDEVICES];
@@ -98,6 +88,7 @@ int getADLInfo(int deviceid, char field[64]) {
   for (i = 0; i < iNumberAdapters; i++) {
     int iAdapterIndex;
     int lpAdapterID;
+    int rv = 0;
 
     iAdapterIndex = lpInfo[i].iAdapterIndex;
 
@@ -118,7 +109,11 @@ int getADLInfo(int deviceid, char field[64]) {
     adapters[devices].id = i;
 
     if (deviceid == devices) {
-      if (strcmp(field, "fanPercent") == 0) {
+      if (strcmp(field, "fanAutoManage") == 0) {
+        rv = ADL_Overdrive5_FanSpeedToDefault_Set(iAdapterIndex, 0);
+        return rv;
+      }
+      if (strcmp(field, "getFanPercent") == 0) {
         ADLFanSpeedValue lpFanSpeedValue = {0};
         lpFanSpeedValue.iSize = sizeof(ADLFanSpeedValue);
         lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
@@ -127,7 +122,16 @@ int getADLInfo(int deviceid, char field[64]) {
         }
         return lpFanSpeedValue.iFanSpeed;
       }
-      if (strcmp(field, "temp") == 0) {
+      if (strcmp(field, "setFanPercent") == 0) {
+        ADLFanSpeedValue lpFanSpeedValue = {0};
+        lpFanSpeedValue.iFanSpeed = arg;
+        lpFanSpeedValue.iFlags |= ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED;
+        lpFanSpeedValue.iSize = sizeof(ADLFanSpeedValue);
+        lpFanSpeedValue.iSpeedType = ADL_DL_FANCTRL_SPEED_TYPE_PERCENT;
+        rv = ADL_Overdrive5_FanSpeed_Set(iAdapterIndex, 0, &lpFanSpeedValue);
+        return rv;
+      }
+      if (strcmp(field, "getTemp") == 0) {
         ADLTemperature lpTemperature = {0};
         lpTemperature.iSize = sizeof(ADLTemperature);
         lpTemperature.iTemperature = 0;
@@ -148,3 +152,27 @@ int getADLInfo(int deviceid, char field[64]) {
 
   return 0;
 }
+
+int getADLFanPercent(int deviceid) {
+  int fanPercent = 0;
+  fanPercent = doADLCommand(deviceid, "getFanPercent", 0);
+  return fanPercent;
+}
+
+int getADLTemp(int deviceid) {
+  int temp = 0;
+  temp = doADLCommand(deviceid, "getTemp", 0);
+  return temp;
+}
+
+int setADLFanAutoManage(int deviceid) {
+  int rv = 0;
+  rv = doADLCommand(deviceid, "fanAutoManage", 0);
+  return rv;
+}
+
+int setADLFanPercent(int deviceid, int fanPercent) {
+  int rv = 0;
+  rv = doADLCommand(deviceid, "setFanPercent", fanPercent);
+  return rv;
+}
diff --git a/adl/adl.go b/adl/adl.go
index a480656..4850feb 100644
--- a/adl/adl.go
+++ b/adl/adl.go
@@ -9,11 +9,13 @@ package adl
 #include <adl_sdk.h>
 int getADLFanPercent(int deviceid);
 int getADLTemp(int deviceid);
+int setADLFanAutoManage(int deviceid);
+int setADLFanPercent(int deviceid, int fanPercent);
 */
 import "C"
 
-// DeviceFanPercent fetches and returns fan utilization for a device index
-func DeviceFanPercent(index int) uint32 {
+// DeviceFanGetPercent fetches and returns fan utilization for a device index
+func DeviceFanGetPercent(index int) uint32 {
 	fanPercent := uint32(0)
 
 	fanPercent = uint32(C.getADLFanPercent(C.int(index)))
@@ -21,6 +23,12 @@ func DeviceFanPercent(index int) uint32 {
 	return fanPercent
 }
 
+// DeviceFanSetPercent sets the fan to a percent value for a device index
+// and returns the ADL return value
+func DeviceFanSetPercent(index int, fanPercent uint32) int {
+	return int(C.setADLFanPercent(C.int(index), C.int(fanPercent)))
+}
+
 // DeviceTemperature fetches and returns temperature for a device index
 func DeviceTemperature(index int) uint32 {
 	temperature := uint32(0)
@@ -29,3 +37,9 @@ func DeviceTemperature(index int) uint32 {
 
 	return temperature
 }
+
+// DeviceFanAutoManage sets auto-management of fanspeed for a device index
+// and returns the ADL return value
+func DeviceFanAutoManage(index int) int {
+	return int(C.setADLFanAutoManage(C.int(index)))
+}
diff --git a/cladldevice.go b/cladldevice.go
index ced0df9..791efe9 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -10,6 +10,7 @@ import (
 	"io"
 	"math"
 	"os"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -34,6 +35,12 @@ const (
 
 var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
 
+func appendBitfield(info, value cl.CL_bitfield, name string, str *string) {
+	if (info & value) != 0 {
+		*str += name
+	}
+}
+
 func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	var programBuffer [1][]byte
 	var programSize [1]cl.CL_size_t
@@ -81,16 +88,21 @@ type Device struct {
 	cuda  bool
 
 	// Items for OpenCL device
-	platformID    cl.CL_platform_id
-	deviceID      cl.CL_device_id
-	deviceName    string
-	context       cl.CL_context
-	queue         cl.CL_command_queue
-	outputBuffer  cl.CL_mem
-	program       cl.CL_program
-	kernel        cl.CL_kernel
-	fanTempActive bool
-	kind          string
+	platformID               cl.CL_platform_id
+	deviceID                 cl.CL_device_id
+	deviceName               string
+	deviceType               string
+	context                  cl.CL_context
+	queue                    cl.CL_command_queue
+	outputBuffer             cl.CL_mem
+	program                  cl.CL_program
+	kernel                   cl.CL_kernel
+	fanControlActive         bool
+	fanControlLastTemp       uint32
+	fanControlLastFanPercent uint32
+	fanTempActive            bool
+	kind                     string
+	tempTarget               uint32
 
 	//cuInput        cu.DevicePtr
 	cuInSize       int64
@@ -125,14 +137,52 @@ type Device struct {
 func deviceStats(index int) (uint32, uint32) {
 	fanPercent := uint32(0)
 	temperature := uint32(0)
-	tempDivisor := uint32(1000)
 
-	fanPercent = adl.DeviceFanPercent(index)
-	temperature = adl.DeviceTemperature(index) / tempDivisor
+	fanPercent = adl.DeviceFanGetPercent(index)
+	temperature = adl.DeviceTemperature(index) / AMDTempDivisor
 
 	return fanPercent, temperature
 }
 
+// fanControlSet steps the ADL fan of device index up or down from fanCur
+// percent: TargetLower speeds it up (capped at 100%), TargetHigher slows it
+// down, and a zero or wrapped-around result falls back to ADLFanFailSafe.
+func fanControlSet(index int, fanCur uint32, tempTargetType string,
+	fanChangeLevel string) {
+	fanAdjustmentPercent := FanControlAdjustmentSmall
+	fanNewPercent := uint32(0)
+	if fanChangeLevel == ChangeLevelLarge {
+		fanAdjustmentPercent = FanControlAdjustmentLarge
+	}
+	minrLog.Tracef("DEV #%d fanControlSet fanCur %v tempTargetType %v "+
+		"fanChangeLevel %v", index, fanCur, tempTargetType, fanChangeLevel)
+
+	switch tempTargetType {
+	case TargetLower: // cool down: speed the fan up, saturating at 100%
+		if fanNewPercent = fanCur + fanAdjustmentPercent; fanNewPercent > 100 {
+			fanNewPercent = 100
+		}
+	case TargetHigher: // warm up: slow the fan down
+		fanNewPercent = fanCur - fanAdjustmentPercent
+	}
+	if fanNewPercent == 0 || fanNewPercent > 100 { // zero or wrapped uint32
+		fanNewPercent = ADLFanFailSafe
+	}
+
+	minrLog.Tracef("DEV #%d need to %v temperature; adjusting fan from "+
+		"fanCur %v%% to fanNewPercent %v%%", index,
+		strings.ToLower(tempTargetType), fanCur, fanNewPercent)
+	rv := adl.DeviceFanSetPercent(index, fanNewPercent)
+	if rv < 0 {
+		minrLog.Errorf("DEV #%d unable to adjust fan ADL error code: %v", index,
+			rv)
+	} else {
+		minrLog.Infof("DEV #%d successfully adjusted fan from %v%% to %v%% "+
+			"to %v temp", index, fanCur, fanNewPercent,
+			strings.ToLower(tempTargetType))
+	}
+}
+
 func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
 	var platformID cl.CL_platform_id
 	platformIDs, err := getCLPlatforms()
@@ -209,12 +259,18 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		platformID:  platformID,
 		deviceID:    deviceID,
 		deviceName:  getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
-		kind:        "adl",
+		deviceType:  getDeviceInfo(deviceID, cl.CL_DEVICE_TYPE, "CL_DEVICE_TYPE"),
+		kind:        DeviceKindUnknown,
 		quit:        make(chan struct{}),
 		newWork:     make(chan *work.Work, 5),
 		workDone:    workDone,
 		fanPercent:  0,
 		temperature: 0,
+		tempTarget:  0,
+	}
+
+	if d.deviceType == DeviceTypeGPU {
+		d.kind = DeviceKindADL
 	}
 
 	var status cl.CL_int
@@ -348,13 +404,51 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		d.index, globalWorkSize, intensity)
 	d.workSize = globalWorkSize
 
-	fanPercent, temperature := deviceStats(d.index)
-	// Newer cards will idle with the fan off so just check if we got
-	// a good temperature reading
-	if temperature != 0 {
-		atomic.StoreUint32(&d.fanPercent, fanPercent)
-		atomic.StoreUint32(&d.temperature, temperature)
-		d.fanTempActive = true
+	switch d.kind {
+	case DeviceKindADL:
+		fanPercent, temperature := deviceStats(d.index)
+		// Newer cards will idle with the fan off so just check if we got
+		// a good temperature reading
+		if temperature != 0 {
+			atomic.StoreUint32(&d.fanPercent, fanPercent)
+			atomic.StoreUint32(&d.temperature, temperature)
+			d.fanTempActive = true
+		}
+		break
+	}
+
+	// Check if temperature target is specified
+	if len(cfg.TempTargetInts) > 0 {
+		// Apply the first setting as a global setting
+		d.tempTarget = cfg.TempTargetInts[0]
+
+		// Override with the per-device setting if it exists
+		for i := range cfg.TempTargetInts {
+			if i == order {
+				d.tempTarget = uint32(cfg.TempTargetInts[order])
+			}
+		}
+		d.fanControlActive = true
+	}
+
+	// validate that we can actually do fan control
+	fanControlNotWorking := false
+	if d.tempTarget > 0 {
+		// validate that fan control is supported
+		if !d.fanControlSupported(d.kind) {
+			return nil, fmt.Errorf("temperature target of %v for device #%v; "+
+				"fan control is not supported on device kind %v", d.tempTarget,
+				index, d.kind)
+		}
+		if !d.fanTempActive {
+			minrLog.Errorf("DEV #%d ignoring temperature target of %v; "+
+				"could not get initial %v read", index, d.tempTarget, d.kind)
+			fanControlNotWorking = true
+		}
+		if fanControlNotWorking {
+			d.tempTarget = 0
+			d.fanControlActive = false
+		}
 	}
 
 	return d, nil
@@ -539,6 +633,23 @@ func getDeviceInfo(id cl.CL_device_id,
 		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
 	}
 
+	switch name {
+	case cl.CL_DEVICE_TYPE:
+		var deviceTypeStr string
+
+		appendBitfield(cl.CL_bitfield(info.(cl.CL_device_type)),
+			cl.CL_bitfield(cl.CL_DEVICE_TYPE_CPU),
+			DeviceTypeCPU,
+			&deviceTypeStr)
+
+		appendBitfield(cl.CL_bitfield(info.(cl.CL_device_type)),
+			cl.CL_bitfield(cl.CL_DEVICE_TYPE_GPU),
+			DeviceTypeGPU,
+			&deviceTypeStr)
+
+		info = deviceTypeStr
+	}
+
 	strinfo := fmt.Sprintf("%v", info)
 
 	return strinfo
@@ -550,4 +661,5 @@ func (d *Device) Release() {
 	cl.CLReleaseCommandQueue(d.queue)
 	cl.CLReleaseMemObject(d.outputBuffer)
 	cl.CLReleaseContext(d.context)
+	adl.DeviceFanAutoManage(d.index)
 }
diff --git a/cldevice.go b/cldevice.go
index 9209aa0..1c29572 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -23,6 +23,8 @@ import (
 	"github.com/decred/gominer/cl"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
+
+	"golang.org/x/sys/unix"
 )
 
 // Return the GPU library in use.
@@ -38,6 +40,126 @@ const (
 
 var zeroSlice = []cl.CL_uint{cl.CL_uint(0)}
 
+func appendBitfield(info, value cl.CL_bitfield, name string, str *string) {
+	if (info & value) != 0 {
+		*str += name
+	}
+}
+
+// amdgpuFanPercentToValue converts a fan percentage into the raw AMDGPU pwm
+// value (1-AMDGPUFanMax). A zero or out-of-range result, such as one caused
+// by a wrapped uint32 percent, yields AMDGPUFanFailSafe instead.
+func amdgpuFanPercentToValue(percent uint32) uint32 {
+	calculatedValue := float64(AMDGPUFanMax) * float64(percent) / float64(100)
+
+	if calculatedValue > 0 && calculatedValue <= float64(AMDGPUFanMax) {
+		return uint32(calculatedValue)
+	}
+
+	minrLog.Errorf("amdgpuFanPercentToValue() failed; using failsafe "+
+		"value of %v", AMDGPUFanFailSafe)
+	return AMDGPUFanFailSafe
+}
+
+// amdgpuFanPermissionsValid verifies the AMDGPU sysfs fan path of the card at
+// index is writable; the error names the path and the unix.Access failure.
+func amdgpuFanPermissionsValid(index int) error {
+	path := amdgpuGetSysfsPath(index, "fan")
+
+	if err := unix.Access(path, unix.W_OK); err != nil {
+		return fmt.Errorf("path %v is not writable: %v", path, err)
+	}
+
+	return nil
+}
+
+// amdgpuGetSysfsPath returns the sysfs path of field for the card at index:
+// "card" and "driver" are fixed paths, "fan" and "temp" are files in the
+// card's hwmon directory; anything else or a failed lookup gives "unknown".
+func amdgpuGetSysfsPath(index int, field string) string {
+	cardPath := fmt.Sprintf("%s%d", "/sys/class/drm/card", index)
+	switch field {
+	case "card":
+		return cardPath
+	case "driver":
+		return "/sys/module/amdgpu"
+	}
+
+	// find hwmon/hwmon<number>
+	hwmonBasePath := cardPath + "/device/hwmon/"
+	hwmonName := ""
+
+	// open hwmon base path and scan for the numbered entry
+	files, err := ioutil.ReadDir(hwmonBasePath)
+	if err != nil {
+		minrLog.Errorf("unable to read AMDGPU sysfs dir %v: %v", hwmonBasePath,
+			err)
+		return "unknown"
+	}
+
+	for _, f := range files {
+		// we should only find one entry but the API may not be stable
+		if strings.Contains(f.Name(), "hwmon") {
+			hwmonName = f.Name()
+		}
+	}
+
+	if hwmonName == "" {
+		minrLog.Errorf("unable to find full hwmon path")
+		return "unknown"
+	}
+
+	hwmonFullPath := hwmonBasePath + hwmonName + "/" // base already ends in "/"
+	switch field {
+	case "fan":
+		return hwmonFullPath + "pwm1"
+	case "temp":
+		return hwmonFullPath + "temp1_input"
+	}
+
+	return "unknown"
+}
+
+// fanControlSet steps the AMDGPU fan of the card at index up or down from
+// fanCur percent: TargetLower speeds it up (capped at 100%), TargetHigher
+// slows it down, and a zero or wrapped result falls back to the failsafe.
+func fanControlSet(index int, fanCur uint32, tempTargetType string,
+	fanChangeLevel string) {
+	fanAdjustmentPercent := FanControlAdjustmentSmall
+	fanNewPercent := uint32(0)
+	if fanChangeLevel == ChangeLevelLarge {
+		fanAdjustmentPercent = FanControlAdjustmentLarge
+	}
+	minrLog.Tracef("DEV #%d fanControlSet fanCur %v tempTargetType %v "+
+		"fanChangeLevel %v", index, fanCur, tempTargetType, fanChangeLevel)
+
+	switch tempTargetType {
+	case TargetLower: // cool down: speed the fan up, saturating at 100%
+		if fanNewPercent = fanCur + fanAdjustmentPercent; fanNewPercent > 100 {
+			fanNewPercent = 100
+		}
+	case TargetHigher: // warm up: slow the fan down
+		fanNewPercent = fanCur - fanAdjustmentPercent
+	}
+	if fanNewPercent == 0 || fanNewPercent > 100 { // zero or wrapped uint32
+		fanNewPercent = AMDGPUFanFailSafe * 100 / AMDGPUFanMax
+	}
+	fanNewValue := amdgpuFanPercentToValue(fanNewPercent)
+	fanPath := amdgpuGetSysfsPath(index, "fan")
+	minrLog.Tracef("DEV #%d need to %v temperature; adjusting fan from "+
+		"fanCur %v%% to fanNewPercent %v%% by writing fanNewValue %v to %v",
+		index, strings.ToLower(tempTargetType), fanCur, fanNewPercent,
+		fanNewValue, fanPath)
+	err := deviceStatsWriteSysfsEntry(fanPath, fanNewValue)
+	if err != nil {
+		minrLog.Errorf("DEV #%d unable to adjust fan: %v", index, err.Error())
+	} else {
+		minrLog.Infof("DEV #%d successfully adjusted fan from %v%% to %v%% to "+
+			"%v temp", index, fanCur, fanNewPercent,
+			strings.ToLower(tempTargetType))
+	}
+}
+
 func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	var programBuffer [1][]byte
 	var programSize [1]cl.CL_size_t
@@ -85,16 +207,21 @@ type Device struct {
 	cuda  bool
 
 	// Items for OpenCL device
-	platformID    cl.CL_platform_id
-	deviceID      cl.CL_device_id
-	deviceName    string
-	context       cl.CL_context
-	queue         cl.CL_command_queue
-	outputBuffer  cl.CL_mem
-	program       cl.CL_program
-	kernel        cl.CL_kernel
-	fanTempActive bool
-	kind          string
+	platformID               cl.CL_platform_id
+	deviceID                 cl.CL_device_id
+	deviceName               string
+	deviceType               string
+	context                  cl.CL_context
+	queue                    cl.CL_command_queue
+	outputBuffer             cl.CL_mem
+	program                  cl.CL_program
+	kernel                   cl.CL_kernel
+	fanControlActive         bool
+	fanControlLastTemp       uint32
+	fanControlLastFanPercent uint32
+	fanTempActive            bool
+	kind                     string
+	tempTarget               uint32
 
 	//cuInput        cu.DevicePtr
 	cuInSize       int64
@@ -129,16 +256,20 @@ type Device struct {
 // If the device order and OpenCL index are ever not the same then we can
 // implement topology finding code:
 // https://github.com/Oblomov/clinfo/blob/master/src/clinfo.c#L1061-L1126
-func determineDeviceKind(index int, deviceName string) string {
-	deviceKind := "unknown"
+func determineDeviceKind(index int, deviceType string) string {
+	deviceKind := DeviceKindUnknown
+
+	if deviceType == DeviceTypeCPU {
+		return deviceKind
+	}
 
 	switch runtime.GOOS {
 	case "linux":
-		// check if the amdgpu driver is loaded
-		if _, err := os.Stat("/sys/module/amdgpu"); err == nil {
+		// check if the AMDGPU driver is loaded
+		if _, err := os.Stat(amdgpuGetSysfsPath(index, "driver")); err == nil {
 			// make sure a sysfs entry exists for the index of this device
-			if _, err := os.Stat("/sys/class/drm/card" + strconv.Itoa(index)); err == nil {
-				deviceKind = "amdgpu"
+			if _, err := os.Stat(amdgpuGetSysfsPath(index, "card")); err == nil {
+				deviceKind = DeviceKindAMDGPU
 			}
 		}
 		break
@@ -148,39 +279,13 @@ func determineDeviceKind(index int, deviceName string) string {
 }
 
 func deviceStats(index int) (uint32, uint32) {
-	basePath := "/sys/class/drm/card_I_/device/hwmon/"
-	basePath = strings.Replace(basePath, "_I_", strconv.Itoa(index), 1)
 	fanPercent := uint32(0)
-	hwmonPath := basePath + "_HWMON_/"
-	hwmonName := ""
 	temperature := uint32(0)
 
-	files, err := ioutil.ReadDir(basePath)
-	if err != nil {
-		minrLog.Errorf("unable to read AMDGPU sysfs dir: %v", err)
-		return fanPercent, temperature
-	}
-
-	for _, f := range files {
-		// we should only find one entry but the API may not be stable
-		if strings.Contains(f.Name(), "hwmon") {
-			hwmonName = f.Name()
-		}
-	}
-
-	if hwmonName == "" {
-		minrLog.Errorf("unable to determine AMDGPU hwmon path")
-		return fanPercent, temperature
-	}
-
-	hwmonPath = strings.Replace(hwmonPath, "_HWMON_", hwmonName, 1)
-	pwmMax := uint32(255) // could read this from pwm1_max but it seems to be a constant
-	tempDivisor := uint32(1000)
-
-	fanPercent = deviceStatsReadSysfsEntry(hwmonPath + "pwm1")
-	fanPercentFloat := float64(fanPercent) / float64(pwmMax) * float64(100)
+	fanPercent = deviceStatsReadSysfsEntry(amdgpuGetSysfsPath(index, "fan"))
+	fanPercentFloat := float64(fanPercent) / float64(AMDGPUFanMax) * float64(100)
 	fanPercent = uint32(fanPercentFloat)
-	temperature = deviceStatsReadSysfsEntry(hwmonPath+"temp1_input") / tempDivisor
+	temperature = deviceStatsReadSysfsEntry(amdgpuGetSysfsPath(index, "temp")) / AMDTempDivisor
 
 	return fanPercent, temperature
 }
@@ -217,6 +322,16 @@ func deviceStatsReadSysfsEntry(path string) uint32 {
 	return res
 }
 
+// deviceStatsWriteSysfsEntry writes value, in decimal followed by a newline,
+// to the sysfs file at path and wraps any write failure with both of them.
+func deviceStatsWriteSysfsEntry(path string, value uint32) error {
+	stringValue := strconv.FormatUint(uint64(value), 10) + "\n"
+	if err := ioutil.WriteFile(path, []byte(stringValue), 0644); err != nil {
+		return fmt.Errorf("unable to write %v to %v: %v", value, path, err)
+	}
+	return nil
+}
+
 func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
 	var platformID cl.CL_platform_id
 	platformIDs, err := getCLPlatforms()
@@ -293,11 +408,13 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		platformID:  platformID,
 		deviceID:    deviceID,
 		deviceName:  getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
+		deviceType:  getDeviceInfo(deviceID, cl.CL_DEVICE_TYPE, "CL_DEVICE_TYPE"),
 		quit:        make(chan struct{}),
 		newWork:     make(chan *work.Work, 5),
 		workDone:    workDone,
 		fanPercent:  0,
 		temperature: 0,
+		tempTarget:  0,
 	}
 
 	var status cl.CL_int
@@ -432,10 +549,10 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	d.workSize = globalWorkSize
 
 	// Determine the device/driver kind
-	d.kind = determineDeviceKind(d.index, d.deviceName)
+	d.kind = determineDeviceKind(d.index, d.deviceType)
 
 	switch d.kind {
-	case "amdgpu":
+	case DeviceKindAMDGPU:
 		fanPercent, temperature := deviceStats(d.index)
 		// Newer cards will idle with the fan off so just check if we got
 		// a good temperature reading
@@ -447,6 +564,48 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		break
 	}
 
+	// Check if temperature target is specified
+	if len(cfg.TempTargetInts) > 0 {
+		// Apply the first setting as a global setting
+		d.tempTarget = cfg.TempTargetInts[0]
+
+		// Override with the per-device setting if it exists
+		for i := range cfg.TempTargetInts {
+			if i == order {
+				d.tempTarget = uint32(cfg.TempTargetInts[order])
+			}
+		}
+		d.fanControlActive = true
+	}
+
+	// validate that we can actually do fan control
+	fanControlNotWorking := false
+	if d.tempTarget > 0 {
+		// validate that fan control is supported
+		if !d.fanControlSupported(d.kind) {
+			return nil, fmt.Errorf("temperature target of %v for device #%v; "+
+				"fan control is not supported on device kind %v", d.tempTarget,
+				index, d.kind)
+		}
+		if !d.fanTempActive {
+			minrLog.Errorf("DEV #%d ignoring temperature target of %v; "+
+				"could not get initial %v read", index, d.tempTarget, d.kind)
+			fanControlNotWorking = true
+		}
+		if !fanControlNotWorking {
+			err := amdgpuFanPermissionsValid(index)
+			if err != nil {
+				minrLog.Errorf("DEV #%d ignoring temperature target of %v; "+
+					"%v", index, d.tempTarget, err)
+				fanControlNotWorking = true
+			}
+		}
+		if fanControlNotWorking {
+			d.tempTarget = 0
+			d.fanControlActive = false
+		}
+	}
+
 	return d, nil
 }
 
@@ -629,6 +788,23 @@ func getDeviceInfo(id cl.CL_device_id,
 		return fmt.Sprintf("Failed to find OpenCL device info %s.\n", str)
 	}
 
+	switch name {
+	case cl.CL_DEVICE_TYPE:
+		var deviceTypeStr string
+
+		appendBitfield(cl.CL_bitfield(info.(cl.CL_device_type)),
+			cl.CL_bitfield(cl.CL_DEVICE_TYPE_CPU),
+			DeviceTypeCPU,
+			&deviceTypeStr)
+
+		appendBitfield(cl.CL_bitfield(info.(cl.CL_device_type)),
+			cl.CL_bitfield(cl.CL_DEVICE_TYPE_GPU),
+			DeviceTypeGPU,
+			&deviceTypeStr)
+
+		info = deviceTypeStr
+	}
+
 	strinfo := fmt.Sprintf("%v", info)
 
 	return strinfo
@@ -640,4 +816,6 @@ func (d *Device) Release() {
 	cl.CLReleaseCommandQueue(d.queue)
 	cl.CLReleaseMemObject(d.outputBuffer)
 	cl.CLReleaseContext(d.context)
+	// XXX need to check if/how the AMDGPU driver/device takes back
+	// automatic fan control like we do for ADL
 }
diff --git a/config.go b/config.go
index 3cd8fe5..242736a 100644
--- a/config.go
+++ b/config.go
@@ -11,6 +11,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/btcsuite/go-flags"
 	"github.com/decred/dcrutil"
@@ -33,9 +34,11 @@ var (
 	defaultLogDir        = filepath.Join(minerHomeDir, defaultLogDirname)
 	defaultAutocalibrate = 500
 
-	minIntensity = 8
-	maxIntensity = 31
-	maxWorkSize  = uint32(0xFFFFFFFF - 255)
+	minIntensity  = 8
+	maxIntensity  = 31
+	minTempTarget = uint32(60)
+	maxTempTarget = uint32(84)
+	maxWorkSize   = uint32(0xFFFFFFFF - 255)
 )
 
 type config struct {
@@ -43,10 +46,11 @@ type config struct {
 	ShowVersion bool `short:"V" long:"version" description:"Display version information and exit"`
 
 	// Config / log options
-	ConfigFile string `short:"C" long:"configfile" description:"Path to configuration file"`
-	LogDir     string `long:"logdir" description:"Directory to log output."`
-	DebugLevel string `short:"d" long:"debuglevel" description:"Logging level for all subsystems {trace, debug, info, warn, error, critical} -- You may also specify <subsystem>=<level>,<subsystem2>=<level>,... to set the log level for individual subsystems -- Use show to list available subsystems"`
-	ClKernel   string `short:"k" long:"kernel" description:"File with cl kernel to use"`
+	Experimental bool   `long:"experimental" description:"enable EXPERIMENTAL features such as setting a temperature target with (-t/--temptarget) which may DAMAGE YOUR DEVICE(S)."`
+	ConfigFile   string `short:"C" long:"configfile" description:"Path to configuration file"`
+	LogDir       string `long:"logdir" description:"Directory to log output."`
+	DebugLevel   string `short:"d" long:"debuglevel" description:"Logging level for all subsystems {trace, debug, info, warn, error, critical} -- You may also specify <subsystem>=<level>,<subsystem2>=<level>,... to set the log level for individual subsystems -- Use show to list available subsystems"`
+	ClKernel     string `short:"k" long:"kernel" description:"File with cl kernel to use"`
 
 	// Debugging options
 	Profile    string `long:"profile" description:"Enable HTTP profiling on given port -- NOTE port must be between 1024 and 65536"`
@@ -75,6 +79,8 @@ type config struct {
 	DeviceIDs         []int
 	Intensity         string `short:"i" long:"intensity" description:"Intensities (the work size is 2^intensity) per device. Single global value or a comma separated list."`
 	IntensityInts     []int
+	TempTarget        string `short:"t" long:"temptarget" description:"Target temperature in Celsius to maintain via automatic fan control. (Requires --experimental flag)"`
+	TempTargetInts    []uint32
 	WorkSize          string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity). Single global value or a comma separated list."`
 	WorkSizeInts      []uint32
 
@@ -420,6 +426,68 @@ func loadConfig() (*config, []string, error) {
 		}
 	}
 
+	// Check the temptarget if the user is setting that.
+	if len(cfg.TempTarget) > 0 {
+		if !cfg.Experimental {
+			err := fmt.Errorf("temperature targets / automatic fan control " +
+				"is an EXPERIMENTAL feature and requires the --experimental " +
+				"flag to acknowledge that you accept the risk of possibly " +
+				"DAMAGING YOUR DEVICE(S) due to software bugs")
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
+		}
+		// Parse a list like -t 80,75
+		if strings.Contains(cfg.TempTarget, ",") {
+			specifiedTempTargets := strings.Split(cfg.TempTarget, ",")
+			cfg.TempTargetInts = make([]uint32, len(specifiedTempTargets))
+			for i := range specifiedTempTargets {
+				j, err := strconv.Atoi(specifiedTempTargets[i])
+				if err != nil {
+					err := fmt.Errorf("Could not convert temptarget "+
+						"(%v) to int: %s", specifiedTempTargets[i],
+						err.Error())
+					fmt.Fprintln(os.Stderr, err)
+					return nil, nil, err
+				}
+
+				cfg.TempTargetInts[i] = uint32(j)
+			}
+			// Use specified temptarget like -t 75
+		} else {
+			cfg.TempTargetInts = make([]uint32, 1)
+			i, err := strconv.Atoi(cfg.TempTarget)
+			if err != nil {
+				err := fmt.Errorf("Could not convert temptarget %v "+
+					"to int: %s", cfg.TempTarget, err.Error())
+				fmt.Fprintln(os.Stderr, err)
+				return nil, nil, err
+			}
+
+			cfg.TempTargetInts[0] = uint32(i)
+		}
+	}
+
+	if cfg.Experimental {
+		fmt.Fprintln(os.Stderr, "enabling EXPERIMENTAL features "+
+			"that may possibly DAMAGE YOUR DEVICE(S)")
+		time.Sleep(time.Second * 3)
+	}
+
+	for i := range cfg.TempTargetInts {
+		if cfg.TempTargetInts[i] < minTempTarget {
+			err := fmt.Errorf("Temp target %v is lower than minimum %v",
+				cfg.TempTargetInts[i], minTempTarget)
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
+		}
+		if cfg.TempTargetInts[i] > maxTempTarget {
+			err := fmt.Errorf("Temp target %v is higher than maximum %v",
+				cfg.TempTargetInts[i], maxTempTarget)
+			fmt.Fprintln(os.Stderr, err)
+			return nil, nil, err
+		}
+	}
+
 	// Check the work size if the user is setting that.
 	if len(cfg.WorkSize) > 0 {
 		// Parse a list like -W 536870912,1073741824
diff --git a/cudevice.go b/cudevice.go
index a7d15e7..c070bcd 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -51,9 +51,14 @@ type Device struct {
 	index int
 	cuda  bool
 
-	deviceName    string
-	fanTempActive bool
-	kind          string
+	deviceName               string
+	deviceType               string
+	fanTempActive            bool
+	fanControlActive         bool
+	fanControlLastTemp       uint32
+	fanControlLastFanPercent uint32
+	kind                     string
+	tempTarget               uint32
 
 	// Items for CUDA device
 	cuDeviceID cu.Device
@@ -133,6 +138,12 @@ func deviceStats(index int) (uint32, uint32) {
 	return fanPercent, temperature
 }
 
+// fanControlSet is an unsupported stub; it is only here for compilation.
+func fanControlSet(index int, fanCur uint32, tempTargetType string,
+	fanChangeLevel string) {
+	minrLog.Errorf("NVML fanControl() reached but shouldn't have been")
+}
+
 func getInfo() ([]cu.Device, error) {
 	cu.Init(0)
 	ids := cu.DeviceGetCount()
@@ -198,13 +209,15 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 		index:       index,
 		cuDeviceID:  deviceID,
 		deviceName:  deviceID.Name(),
+		deviceType:  DeviceTypeGPU,
 		cuda:        true,
-		kind:        "nvidia",
+		kind:        DeviceKindNVML,
 		quit:        make(chan struct{}),
 		newWork:     make(chan *work.Work, 5),
 		workDone:    workDone,
 		fanPercent:  0,
 		temperature: 0,
+		tempTarget:  0,
 	}
 
 	d.cuInSize = 21
@@ -218,6 +231,40 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 		d.fanTempActive = true
 	}
 
+	// Check if temperature target is specified
+	if len(cfg.TempTargetInts) > 0 {
+		// Apply the first setting as a global setting
+		d.tempTarget = cfg.TempTargetInts[0]
+
+		// Override with the per-device setting if it exists
+		for i := range cfg.TempTargetInts {
+			if i == order {
+				d.tempTarget = uint32(cfg.TempTargetInts[order])
+			}
+		}
+		d.fanControlActive = true
+	}
+
+	// validate that we can actually do fan control
+	fanControlNotWorking := false
+	if d.tempTarget > 0 {
+		// validate that fan control is supported
+		if !d.fanControlSupported(d.kind) {
+			return nil, fmt.Errorf("temperature target of %v for device #%v; "+
+				"fan control is not supported on device kind %v", d.tempTarget,
+				index, d.kind)
+		}
+		if !d.fanTempActive {
+			minrLog.Errorf("DEV #%d ignoring temperature target of %v; "+
+				"could not get initial %v read", index, d.tempTarget, d.kind)
+			fanControlNotWorking = true
+		}
+		if fanControlNotWorking {
+			d.tempTarget = 0
+			d.fanControlActive = false
+		}
+	}
+
 	d.started = uint32(time.Now().Unix())
 
 	// Autocalibrate?
diff --git a/device.go b/device.go
index f0daace..1e1e47c 100644
--- a/device.go
+++ b/device.go
@@ -19,6 +19,31 @@ import (
 
 var chainParams = &chaincfg.MainNetParams
 
+// Constants for fan and temperature bits
+const (
+	ADLFanFailSafe            = uint32(80)
+	AMDGPUFanFailSafe         = uint32(204)
+	AMDGPUFanMax              = uint32(255)
+	AMDTempDivisor            = uint32(1000)
+	ChangeLevelNone           = "None"
+	ChangeLevelSmall          = "Small"
+	ChangeLevelLarge          = "Large"
+	DeviceKindAMDGPU          = "AMDGPU"
+	DeviceKindADL             = "ADL"
+	DeviceKindNVML            = "NVML"
+	DeviceKindUnknown         = "Unknown"
+	DeviceTypeCPU             = "CPU"
+	DeviceTypeGPU             = "GPU"
+	FanControlHysteresis      = uint32(3)
+	FanControlAdjustmentLarge = uint32(10)
+	FanControlAdjustmentSmall = uint32(5)
+	SeverityLow               = "Low"
+	SeverityHigh              = "High"
+	TargetLower               = "Lower"
+	TargetHigher              = "Raise"
+	TargetNone                = "None"
+)
+
 func (d *Device) updateCurrentWork() {
 	var w *work.Work
 	if d.hasWork {
@@ -73,6 +98,172 @@ func (d *Device) Run() {
 	}
 }
 
+// fanControl makes at most one fan adjustment per call to steer the device
+// toward d.tempTarget; while the temperature is within FanControlHysteresis
+// of the target nothing is changed. This is pretty hacky/proof-of-concepty.
+func (d *Device) fanControl() {
+	d.Lock()
+	defer d.Unlock()
+	fanChange := 0
+	fanChangeLevel := ""
+	fanIntent := ""
+	fanLast := d.fanControlLastFanPercent
+	tempChange := 0
+	tempChangeLevel := ""
+	tempDirection := ""
+	tempLast := d.fanControlLastTemp
+	tempMinAllowed := d.tempTarget - FanControlHysteresis
+	tempMaxAllowed := d.tempTarget + FanControlHysteresis
+	tempSeverity := ""
+	tempTargetType := ""
+	firstRun := false
+
+	// Save the values we read for the next time the loop is run
+	fanCur := atomic.LoadUint32(&d.fanPercent)
+	tempCur := atomic.LoadUint32(&d.temperature)
+	d.fanControlLastFanPercent = fanCur
+	d.fanControlLastTemp = tempCur
+
+	// if this is our first run then set some more variables
+	if tempLast == 0 && fanLast == 0 {
+		fanLast = fanCur
+		tempLast = tempCur
+		firstRun = true
+	}
+
+	// Everything is OK so just return without adjustment
+	if tempCur <= tempMaxAllowed && tempCur >= tempMinAllowed {
+		minrLog.Tracef("DEV #%d within acceptable limits "+
+			"curTemp %v is above minimum %v and below maximum %v",
+			d.index, tempCur, tempMinAllowed, tempMaxAllowed)
+		return
+	}
+
+	// Lower the temperature of the device
+	if tempCur > tempMaxAllowed {
+		tempTargetType = TargetLower
+		if tempCur-tempMaxAllowed > FanControlHysteresis {
+			tempSeverity = SeverityHigh
+		} else {
+			tempSeverity = SeverityLow
+		}
+	}
+
+	// Raise the temperature of the device
+	if tempCur < tempMinAllowed {
+		tempTargetType = TargetHigher
+		if tempMinAllowed-tempCur > FanControlHysteresis { // gap below minimum
+			tempSeverity = SeverityHigh
+		} else {
+			tempSeverity = SeverityLow
+		}
+	}
+
+	// we increased the fan to lower the device temperature last time
+	if fanLast < fanCur {
+		fanChange = int(fanCur) - int(fanLast)
+		fanIntent = TargetLower
+	}
+	// we decreased the fan to raise the device temperature last time
+	if fanLast > fanCur {
+		fanChange = int(fanLast) - int(fanCur)
+		fanIntent = TargetHigher
+	}
+	// we didn't make any changes
+	if fanLast == fanCur {
+		fanIntent = TargetNone
+	}
+
+	if fanChange == 0 {
+		fanChangeLevel = ChangeLevelNone
+	} else if fanChange == int(FanControlAdjustmentSmall) {
+		fanChangeLevel = ChangeLevelSmall
+	} else if fanChange == int(FanControlAdjustmentLarge) {
+		fanChangeLevel = ChangeLevelLarge
+	} else {
+		// XXX Seems the AMDGPU driver may not support all values or
+		// changes values underneath us
+		minrLog.Tracef("DEV #%d fan changed by an unexpected value %v", d.index,
+			fanChange)
+		if fanChange < int(FanControlAdjustmentSmall) {
+			fanChangeLevel = ChangeLevelSmall
+		} else {
+			fanChangeLevel = ChangeLevelLarge
+		}
+	}
+
+	if tempLast < tempCur {
+		tempChange = int(tempCur) - int(tempLast)
+		tempDirection = "Up"
+	}
+	if tempLast > tempCur {
+		tempChange = int(tempLast) - int(tempCur)
+		tempDirection = "Down"
+	}
+	if tempLast == tempCur {
+		tempDirection = "Stable"
+	}
+
+	if tempChange == 0 {
+		tempChangeLevel = ChangeLevelNone
+	} else if tempChange > int(FanControlHysteresis) {
+		tempChangeLevel = ChangeLevelLarge
+	} else {
+		tempChangeLevel = ChangeLevelSmall
+	}
+
+	minrLog.Tracef("DEV #%d firstRun %v fanChange %v fanChangeLevel %v "+
+		"fanIntent %v tempChange %v tempChangeLevel %v tempDirection %v "+
+		" tempSeverity %v tempTargetType %v", d.index, firstRun, fanChange,
+		fanChangeLevel, fanIntent, tempChange, tempChangeLevel, tempDirection,
+		tempSeverity, tempTargetType)
+
+	// We have no idea if the device is starting cold or re-starting hot
+	// so only adjust the fans upwards a little bit.
+	if firstRun && tempTargetType == TargetLower {
+		fanControlSet(d.index, fanCur, tempTargetType, ChangeLevelSmall)
+		return
+	}
+
+	// we didn't do anything last time so just match our change to the severity
+	if fanIntent == TargetNone {
+		if tempSeverity == SeverityLow {
+			fanControlSet(d.index, fanCur, tempTargetType, ChangeLevelSmall)
+		} else {
+			fanControlSet(d.index, fanCur, tempTargetType, ChangeLevelLarge)
+		}
+		return // the overshoot case below must not overwrite this step
+	}
+
+	// XXX could do some more hysteresis stuff here
+
+	// we tried to raise or lower the temperature but it didn't work so
+	// do it some more according to the severity level
+	if fanIntent == tempTargetType {
+		if tempSeverity == SeverityLow {
+			fanControlSet(d.index, fanCur, tempTargetType, ChangeLevelSmall)
+		} else {
+			fanControlSet(d.index, fanCur, tempTargetType, ChangeLevelLarge)
+		}
+		return
+	}
+
+	// we raised or lowered the temperature too much so just do a small
+	// adjustment
+	fanControlSet(d.index, fanCur, tempTargetType, ChangeLevelSmall)
+}
+
+// fanControlSupported reports whether kind names a driver whose fans this
+// miner can control; only the ADL and AMDGPU kinds qualify.
+func (d *Device) fanControlSupported(kind string) bool {
+	switch kind {
+	case DeviceKindADL, DeviceKindAMDGPU:
+		return true
+	default:
+		return false
+	}
+}
+
 func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	d.Lock()
 	defer d.Unlock()
@@ -89,7 +280,7 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	// work check are considered to be hardware errors.
 	hashNum := blockchain.ShaHashToBig(&hash)
 	if hashNum.Cmp(chainParams.PowLimit) > 0 {
-		minrLog.Errorf("DEV #%d: Hardware error found, hash %v above "+
+		minrLog.Errorf("DEV #%d Hardware error found, hash %v above "+
 			"minimum target %064x", d.index, hash, d.work.Target.Bytes())
 		d.invalidShares++
 		return
@@ -100,10 +291,10 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	if !cfg.Benchmark {
 		// Assess versus the pool or daemon target.
 		if hashNum.Cmp(d.work.Target) > 0 {
-			minrLog.Debugf("DEV #%d: Hash %v bigger than target %032x (boo)",
+			minrLog.Debugf("DEV #%d Hash %v bigger than target %032x (boo)",
 				d.index, hash, d.work.Target.Bytes())
 		} else {
-			minrLog.Infof("DEV #%d: Found hash with work below target! %v (yay)",
+			minrLog.Infof("DEV #%d Found hash with work below target! %v (yay)",
 				d.index, hash)
 			d.validShares++
 			d.workDone <- data
@@ -158,9 +349,9 @@ func (d *Device) UpdateFanTemp() {
 	if d.fanTempActive {
 		// For now amd and nvidia do more or less the same thing
 		// but could be split up later.  Anything else (Intel) just
-		// don't do anything.
+		// doesn't do anything.
 		switch d.kind {
-		case "adl", "amdgpu", "nvidia":
+		case DeviceKindADL, DeviceKindAMDGPU, DeviceKindNVML:
 			fanPercent, temperature := deviceStats(d.index)
 			atomic.StoreUint32(&d.fanPercent, fanPercent)
 			atomic.StoreUint32(&d.temperature, temperature)
diff --git a/glide.lock b/glide.lock
index e616159..6ec7838 100644
--- a/glide.lock
+++ b/glide.lock
@@ -49,4 +49,8 @@ imports:
   version: 9859625390900fa7029ce8cfdeb430f239e502d4
   subpackages:
   - cuda/cu
+- name: golang.org/x/sys
+  version: 8f0908ab3b2457e2e15403d3697c9ef5cb4b57a9
+  subpackages:
+  - unix
 testImports: []
diff --git a/miner.go b/miner.go
index 541dc7a..2724753 100644
--- a/miner.go
+++ b/miner.go
@@ -189,6 +189,9 @@ func (m *Miner) printStatsThread() {
 		for _, d := range m.devices {
 			d.UpdateFanTemp()
 			d.PrintStats()
+			if d.fanControlActive {
+				d.fanControl()
+			}
 		}
 
 		select {

From 72bee914c0bfa384994491c6303fddc7eb6b2e40 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 27 Sep 2016 17:06:55 -0400
Subject: [PATCH 068/150] Improve cpu usage with CUDA.

Switch to fork of mumax/3/cuda/cu that adds calls to the CUDA runtime
api.  ccminer uses the runtime api and so by using that we can use the
same methods to get similar cpu performance.

This also removes the use of CUDA contexts completely.

There are still a few calls (for getting device count, version, etc)
that use the older driver api but you are allowed to mix and match and
those can be converted at a later date if desired.

Closes #86
---
 README.windows        |  2 +-
 cudakernel_static.go  |  2 +-
 cudakernel_windows.go |  2 +-
 cudevice.go           | 25 ++++++++++---------------
 glide.lock            |  8 ++++----
 glide.yaml            |  3 +++
 6 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/README.windows b/README.windows
index 5911d2d..850aef3 100644
--- a/README.windows
+++ b/README.windows
@@ -29,7 +29,7 @@ Compiling gominer on windows requires installation in following order:
 * ensure that GOPATH is set somewhere sane (e.g. %HOMEPATH%\go)
 
 Install mumax/3 (in cmd.exe):
-$ go get github.com/mumax/3
+$ go get github.com/jcvernaleo/3
 
 Install glide (in git-bash):
 $ go get github.com/Masterminds/glide
diff --git a/cudakernel_static.go b/cudakernel_static.go
index d574f75..9d716a5 100644
--- a/cudakernel_static.go
+++ b/cudakernel_static.go
@@ -9,7 +9,7 @@ package main
 */
 import "C"
 import (
-	"github.com/mumax/3/cuda/cu"
+	"github.com/jcvernaleo/3/cuda/cu"
 	"unsafe"
 )
 
diff --git a/cudakernel_windows.go b/cudakernel_windows.go
index a8cadab..4f28cb9 100644
--- a/cudakernel_windows.go
+++ b/cudakernel_windows.go
@@ -7,7 +7,7 @@ import (
 	"syscall"
 	"unsafe"
 
-	"github.com/mumax/3/cuda/cu"
+	"github.com/jcvernaleo/3/cuda/cu"
 )
 
 var (
diff --git a/cudevice.go b/cudevice.go
index c070bcd..6a749a9 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -19,7 +19,7 @@ import (
 	"time"
 	"unsafe"
 
-	"github.com/mumax/3/cuda/cu"
+	"github.com/jcvernaleo/3/cuda/cu"
 
 	"github.com/decred/gominer/nvml"
 	"github.com/decred/gominer/util"
@@ -61,9 +61,7 @@ type Device struct {
 	tempTarget               uint32
 
 	// Items for CUDA device
-	cuDeviceID cu.Device
-	cuContext  cu.Context
-	//cuInput        cu.DevicePtr
+	cuDeviceID     cu.Device
 	cuInSize       int64
 	cuOutputBuffer []float64
 
@@ -280,21 +278,19 @@ func (d *Device) runDevice() error {
 	d.extraNonce += uint32(d.index) << 24
 	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
 
-	// Need to have this stuff here for a ctx vs thread issue.
+	// Need to have this stuff here for a device vs thread issue.
 	runtime.LockOSThread()
 
-	// Create the CU context
-	d.cuContext = cu.CtxCreate(cu.CTX_BLOCKING_SYNC, d.cuDeviceID)
-
-	// Allocate the input region
-	d.cuContext.SetCurrent()
+	cu.DeviceReset()
+	cu.SetDevice(d.cuDeviceID)
+	cu.SetDeviceFlags(cu.DeviceScheduleBlockingSync)
 
 	// kernel is built with nvcc, not an api call so must be done
 	// at compile time.
 
 	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
-	nonceResultsH := cu.MemAllocHost(d.cuInSize * 4)
-	nonceResultsD := cu.MemAlloc(d.cuInSize * 4)
+	nonceResultsH := cu.MallocHost(d.cuInSize * 4)
+	nonceResultsD := cu.Malloc(d.cuInSize * 4)
 	defer cu.MemFreeHost(nonceResultsH)
 	defer nonceResultsD.Free()
 
@@ -432,7 +428,6 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 }
 
 func (d *Device) Release() {
-	d.cuContext.SetCurrent()
-	//d.cuInput.Free()
-	cu.CtxDestroy(&d.cuContext)
+	cu.SetDevice(d.cuDeviceID)
+	cu.DeviceReset()
 }
diff --git a/glide.lock b/glide.lock
index 6ec7838..4d9d019 100644
--- a/glide.lock
+++ b/glide.lock
@@ -1,5 +1,5 @@
-hash: 91c7f7aacbc4b5f82b14c9de3212c07257421b40c37926f571c3bc79f19c6060
-updated: 2016-08-16T11:10:41.055231248-04:00
+hash: a629ff7ac31d9554df88c93ed389d830fda2bb73a5d2c53b73d362dac0538e93
+updated: 2016-09-28T11:19:49.975498348-04:00
 imports:
 - name: github.com/btcsuite/btclog
   version: f96df2375f37300305f329b8e5258764b4f19a7f
@@ -45,8 +45,8 @@ imports:
   version: b0909d3f798b97a03c9e77023f97a5301a2a7900
   subpackages:
   - edwards25519
-- name: github.com/mumax/3
-  version: 9859625390900fa7029ce8cfdeb430f239e502d4
+- name: github.com/jcvernaleo/3
+  version: ff7ea4431a97bb2b389739cb9087bbc11faa1fe0
   subpackages:
   - cuda/cu
 - name: golang.org/x/sys
diff --git a/glide.yaml b/glide.yaml
index 7728430..b454f1a 100644
--- a/glide.yaml
+++ b/glide.yaml
@@ -16,3 +16,6 @@ import:
   - chaincfg/chainhash
   - wire
 - package: github.com/decred/dcrutil
+- package: github.com/jcvernaleo/3
+  subpackages:
+  - cuda/cu

From c8f35e096406738c951f2d7a15818dca138a962c Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Wed, 28 Sep 2016 14:55:25 -0500
Subject: [PATCH 069/150] add some default Windows CFLAGS/LDFLAGS and remove
 unixy code (#117)

---
 adl/adl.go      |  3 ++-
 cl/cgo_flags.go |  2 ++
 cldevice.go     | 11 +++++++----
 glide.lock      |  6 +-----
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/adl/adl.go b/adl/adl.go
index 4850feb..52ddd83 100644
--- a/adl/adl.go
+++ b/adl/adl.go
@@ -1,9 +1,10 @@
 package adl
 
 /*
-// XXX all the C implementations use dlopen()
+// XXX we should really be using dlopen/LoadLibrary like C miners do
 #cgo linux CFLAGS: -DLINUX
 #cgo linux LDFLAGS: -latiadlxx -ldl
+#cgo windows LDFLAGS: -LC:/appsdk/lib/x86_64 -latiadlxx
 #include <stddef.h>
 #include <stdbool.h>
 #include <adl_sdk.h>
diff --git a/cl/cgo_flags.go b/cl/cgo_flags.go
index a5363dd..1a42d6c 100644
--- a/cl/cgo_flags.go
+++ b/cl/cgo_flags.go
@@ -2,7 +2,9 @@ package cl
 
 /*
 #cgo CFLAGS: -I CL
+#cgo windows CFLAGS: -IC:/appsdk/include
 #cgo !darwin LDFLAGS: -lOpenCL
 #cgo darwin LDFLAGS: -framework OpenCL
+#cgo windows LDFLAGS: -LC:/appsdk/lib/x86_64
 */
 import "C"
diff --git a/cldevice.go b/cldevice.go
index 1c29572..6c6497e 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -23,8 +23,6 @@ import (
 	"github.com/decred/gominer/cl"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
-
-	"golang.org/x/sys/unix"
 )
 
 // Return the GPU library in use.
@@ -65,9 +63,14 @@ func amdgpuFanPercentToValue(percent uint32) uint32 {
 func amdgpuFanPermissionsValid(index int) error {
 	path := amdgpuGetSysfsPath(index, "fan")
 
-	err := unix.Access(path, unix.W_OK)
+	file, err := os.OpenFile(path, os.O_WRONLY, 0666)
+	file.Close()
 	if err != nil {
-		return fmt.Errorf("path %v is not writable", path)
+		if os.IsPermission(err) {
+			return fmt.Errorf("path %v is not writable", path)
+		} else {
+			return fmt.Errorf("path %v unusable %v", path, err)
+		}
 	}
 
 	return nil
diff --git a/glide.lock b/glide.lock
index 4d9d019..b60752d 100644
--- a/glide.lock
+++ b/glide.lock
@@ -1,5 +1,5 @@
 hash: a629ff7ac31d9554df88c93ed389d830fda2bb73a5d2c53b73d362dac0538e93
-updated: 2016-09-28T11:19:49.975498348-04:00
+updated: 2016-09-28T14:47:51.839078623-05:00
 imports:
 - name: github.com/btcsuite/btclog
   version: f96df2375f37300305f329b8e5258764b4f19a7f
@@ -49,8 +49,4 @@ imports:
   version: ff7ea4431a97bb2b389739cb9087bbc11faa1fe0
   subpackages:
   - cuda/cu
-- name: golang.org/x/sys
-  version: 8f0908ab3b2457e2e15403d3697c9ef5cb4b57a9
-  subpackages:
-  - unix
 testImports: []

From 33049de5747031c957bc9debfb411e426b6603aa Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 29 Sep 2016 09:23:17 -0400
Subject: [PATCH 070/150] Update sample config with all new options.

Closes #118
---
 sample-gominer.conf | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/sample-gominer.conf b/sample-gominer.conf
index 124f83b..c74e8e9 100644
--- a/sample-gominer.conf
+++ b/sample-gominer.conf
@@ -3,7 +3,6 @@
 ; ------------------------------------------------------------------------------
 ; General settings
 ; ------------------------------------------------------------------------------
-;
 
 ; Location of logfiles.
 ; logdir=/some/path
@@ -20,6 +19,9 @@
 ; proxyuser=
 ; proxypass=
 
+; Device to use, can be a comma separated list for multiple devices.
+; devices=0,1
+
 ; ------------------------------------------------------------------------------
 ; Network settings
 ; ------------------------------------------------------------------------------
@@ -30,6 +32,10 @@
 ; Use simnet (cannot be used with testnet=1).
 ; simnet=1
 
+; ------------------------------------------------------------------------------
+; Profiling settings
+; ------------------------------------------------------------------------------
+
 ; Enable full profile on specified port.
 ; profile=1234
 
@@ -43,11 +49,6 @@
 ; RPC client settings
 ; ------------------------------------------------------------------------------
 
-; Connect via a SOCKS5 proxy.
-; proxy=127.0.0.1:9050
-; proxyuser=
-; proxypass=
-
 ; Username and password to authenticate connections to a Decred RPC server
 ; (usually dcrd)
 ; rpcuser=
@@ -62,13 +63,19 @@
 ; Disable tls for rpc
 ; notls=1
 
+; Do not verify tls cert (not recommended!)
+; skipverify=1
+
 ; ------------------------------------------------------------------------------
 ; Mining settings
 ; ------------------------------------------------------------------------------
 
-; Location of kernel to use for mining.
+; Location of kernel to use for mining (opencl only).
 ; kernel=./blake256.cl
 
+; Autocalibrate time target in ms to spend executing hashes for each iteration.
+; autocalibrate=40
+
 ; Intensity (the work size is 2^intensity) with one entry per device.
 ; intensity=26
 ; intensity=25
@@ -88,3 +95,17 @@
 
 ; Password for mining pool.
 ; poolpass=
+
+; ------------------------------------------------------------------------------
+; Experimental settings
+; Settings in this section are new and/or dangerous and have the potential to
+; damage your hardware if you are not careful
+; ------------------------------------------------------------------------------
+
+; Enable experimental options
+; experimental=1
+
+; Target temperature in Celsius for GPU to maintain (must be within the range
+; 60C to 84C).
+; temptarget=80
+

From 5350a8fa286a1a6cf9f60b12ebba997fae526da9 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Thu, 29 Sep 2016 11:51:23 -0500
Subject: [PATCH 071/150] add more Windows details and some general
 improvements (#120)

---
 README.md      | 141 +++++++++++++++++++++++++++++++++++++------------
 README.windows |  55 -------------------
 2 files changed, 106 insertions(+), 90 deletions(-)
 delete mode 100644 README.windows

diff --git a/README.md b/README.md
index f59bbfa..5cfda06 100644
--- a/README.md
+++ b/README.md
@@ -1,65 +1,136 @@
 # gominer
 
-## Installation
+gominer is an application for performing Proof-of-Work (PoW) mining on the
+Decred network.  It supports solo and stratum/pool mining using CUDA and
+OpenCL devices.
 
-You need to have the OpenCL or CUDA development libraries
-installed (depending on which version of gominer you would like to
-build) . You also need the runtime and drivers for the one you plan
-on running (CUDA for nvidia, OpenCL for anything) To download and
-build gominer, run:
+## Downloading
 
-```
-go get -u github.com/Masterminds/glide
-mkdir -p $GOPATH/src/github.com/decred
-cd $GOPATH/src/github.com/decred
-git clone  https://github.com/decred/gominer.git
-cd gominer
-glide i
-```
+Linux and Windows 64-bit binaries may be downloaded from:
+
+[https://github.com/decred/decred-binaries/releases/latest](https://github.com/decred/decred-binaries/releases/latest)
+
+## Running
+
+Benchmark mode:
 
-For OpenCL:
 ```
-go install -tags 'opencl'
+gominer -B
 ```
 
-For OpenCL with AMD Device Library (ADL) support:
+Solo mining on mainnet using dcrd running on the local host:
+
 ```
-go install -tags 'opencladl'
+gominer -u myusername -P hunter2
 ```
 
-For CUDA with NVIDIA Management Library (NVML) support:
+Stratum/pool mining:
+
 ```
-make
-go install -tags 'cuda'
+gominer -o stratum+tcp://pool:port -m username -n password
 ```
 
-On Ubuntu 16.04 you can install the necessary OpenCL packages (for
-Intel Graphics cards) and CUDA libraries with:
+## Linux Build Pre-Requisites
+
+You will either need to install CUDA for NVIDIA graphics cards or OpenCL
+library/headers that support your device such as: AMDGPU-PRO (for newer AMD
+cards), Beignet (for Intel Graphics), or Catalyst (for older AMD cards).
+
+For example, on Ubuntu 16.04 you can install the necessary OpenCL packages (for
+Intel Graphics) and CUDA libraries with:
 
 ```
 sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
-Other graphics cards will need different libraries.  We have built
-successfully on Ubuntu 16.04 with go1.6.2, go1.7.1, g++ 5.4.0 and
-beignet-dev 1.1.1-2 although other combinations should work as well.
+gominer has been built successfully on Ubuntu 16.04 with go1.6.2, go1.7.1,
+g++ 5.4.0, and beignet-dev 1.1.1-2 although other combinations should work as
+well.
 
-## Running
+## Linux Build Instructions
 
-Run for benchmark:
+To download and build gominer, run:
 
 ```
-gominer -B
+go get -u github.com/Masterminds/glide
+mkdir -p $GOPATH/src/github.com/decred
+cd $GOPATH/src/github.com/decred
+git clone  https://github.com/decred/gominer.git
+cd gominer
+glide install
 ```
 
-Run for real mining:
-
+For CUDA with NVIDIA Management Library (NVML) support:
 ```
-gominer -u myusername -P hunter2
+make
 ```
 
-To mine on a pool:
-
+For OpenCL (autodetects AMDGPU support):
 ```
-gominer -o stratum+tcp://pool:port -m username -n password
+go build -tags opencl
+```
+
+For OpenCL with AMD Device Library (ADL) support:
 ```
+go build -tags opencladl
+```
+
+## Windows Build Pre-Requisites
+
+- Download and install the official Go Windows binaries from [https://golang.org/dl/](https://golang.org/dl/)
+- Download and install Git for Windows from [https://git-for-windows.github.io/](https://git-for-windows.github.io/)
+  * Make sure to select the Git-Bash option when prompted
+- Download the MinGW-w64 installer from [https://sourceforge.net/projects/mingw-w64/files/Toolchains targetting Win32/Personal Builds/mingw-builds/installer/](https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/)
+  * Select the x64 toolchain and use defaults for the other questions
+- Set the environment variable GOPATH to C:\Users\username\go
+- Check that the GOROOT environment variable is set to C:\Go
+  * This should have been done by the Go installer
+- Add the following locations to your PATH C:\Users\username\go\bin;C:\Go\bin
+- Add C:\Program Files\mingw-w64\x86_64-6.2.0-posix-seh-rt_v5-rev1\mingw64\bin to your PATH (This is the latest release as of 2016-09-29)
+- go get github.com/Masterminds/glide
+  * You should be able to type ```glide``` and get glide's usage display.  If not, double check the steps above
+- go get github.com/decred/gominer
+  * Compilation will most likely fail which can be safely ignored for now.
+- Change to the gominer directory
+  * ```cd $GOPATH/src/github.com/decred/gominer```
+- Install dependencies via glide
+  * ```glide install```
+
+### CUDA Specific Steps
+
+- Download Microsoft Visual Studio 2013 from [https://www.microsoft.com/en-us/download/details.aspx?id=44914](https://www.microsoft.com/en-us/download/details.aspx?id=44914)
+- Add C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin to your PATH
+- Install CUDA 7.0 from [https://developer.nvidia.com/cuda-toolkit-70](https://developer.nvidia.com/cuda-toolkit-70)
+- Add C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\bin to your PATH
+
+### OpenCL/ADL Specific Steps
+
+- Download AMD APP SDK v3.0 from [http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/)
+  * Samples may be unselected from the install to save space as only the libraries and headers are needed
+- Copy or Move C:\Program Files (x86)\AMD APP SDK\3.0 to C:\appsdk
+  * Ensure the folders C:\appsdk\include and C:\appsdk\lib are populated
+- Change to the library directory C:\appsdk\lib\x86_64
+  * ```cd C:\appsdk\lib\x86_64```
+- Copy and prepare the ADL library for linking
+  * ```copy c:\Windows\SysWOW64\atiadlxx.dll .```
+  * ```gendef atiadlxx.dll```
+  * ```dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def```
+
+## Windows Build Instructions
+
+### CUDA
+
+- Using git-bash:
+  * ```cd $GOPATH/src/github.com/decred/gominer```
+  * ```mingw32-make.exe```
+- Copy dependencies:
+  * ```copy obj/decred.dll .```
+  * ```copy nvidia/NVSMI/nvml.dll .```
+
+### OpenCL / OpenCL w/ADL support
+
+- For OpenCL:
+  * ```go build -tags opencl```
+
+- For OpenCL with AMD Device Library (ADL) support:
+  * ```go build -tags opencladl```
diff --git a/README.windows b/README.windows
deleted file mode 100644
index 850aef3..0000000
--- a/README.windows
+++ /dev/null
@@ -1,55 +0,0 @@
-Windows support for gominer requires very specific build steps.  Unlike Linux
-Windows requires dynamically linked libraries and some additional hoops to jump
-through.  Great pains went into making the build as simple as possible and
-unfortunately it is still complex and requires specific versions.  Effort went
-into using less painful toolchains but those ended all in failure.  It is
-advisable to install the toolchain on a fresh machines; leftovers are going to
-make stuff not work right.  Please accept the dark magic used here as gospel
-for any other way is a sin.
-
-========================================================================
-MAKE SURE THERE ARE NO OTHER GCC/BINUTILS IN THE PATH!
-MAKE SURE THERE ARE NO LEFTOVER INCLUDES AND LIBRARIES!
-ONLY USE MINGW64!
-YOU HAVE BEEN WARNED!
-========================================================================
-
-Note that the build system copies various directories into the relative path in
-order to work arround issues in cgo/windows.
-
-Compiling gominer on windows requires installation in following order:
-* Microsoft Visual Studio 2013 (cl.exe, used by nvcc)
-* NVIDIA CUDA V7.0 drivers (includes nvml.dll) and GPU toolkit (nvcc, headers, libs etc)
-* Official Go 1.7.1 installed in the default location and with GOPATH set
-* Git-Bash (for the shell and to check out code)
-* MingW64 (only mingw64 with defaults works!) http://sourceforge.net/projects/mingw-w64/
-
-* Setup environment (in advanced windows settings!):
-* ensure that cl.exe, go.exe and git.exe are in PATH
-* ensure that GOPATH is set somewhere sane (e.g. %HOMEPATH%\go)
-
-Install mumax/3 (in cmd.exe):
-$ go get github.com/jcvernaleo/3
-
-Install glide (in git-bash):
-$ go get github.com/Masterminds/glide
-
-Download gominer source (in git-bash; will fail compilation but that is ok):
-$ go get github.com/decred/gominer
-
-Setup vendoring for gominer
-$ cd $GOPATH/src/github.com/decred/gominer
-$ glide i
-
-Building CUDA (in git-bash):
-$ cd $GOPATH/src/github.com/decred/gominer
-$ mingw32-make.exe
-
-Distribution requires 3 files:
-* gominer.exe
-* decred.dll (relative path -> obj/decred.dll)
-* nvml.dll (relative path -> nvidia/NVSMI/nvml.dll)
-
-NOTE: CUDA does not work over remote desktop.  If one wishes to remotely manage
-a windows machine that uses CUDA one MUST use VNC instead.  This is a CUDA
-issue, not a gominer issue.

From 8c540b325636b7e225e0a39ffcf99a64ec01c70b Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 4 Oct 2016 11:34:56 -0400
Subject: [PATCH 072/150] Bump for v0.5.0

---
 version.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/version.go b/version.go
index e5a9c8f..f4c0569 100644
--- a/version.go
+++ b/version.go
@@ -31,8 +31,8 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 4
-	appPatch uint = 1
+	appMinor uint = 5
+	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet
 	// per the semantic versioning spec.

From 13cecddb128cd67f6d4249205122eda255f3c221 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 4 Nov 2016 07:20:45 -0400
Subject: [PATCH 073/150] Bump for v0.6.0

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index f4c0569..9dbc512 100644
--- a/version.go
+++ b/version.go
@@ -31,7 +31,7 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 5
+	appMinor uint = 6
 	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet

From 0b0037ad0d5071249ac554ca72cf2d779574a758 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 22 Nov 2016 11:18:51 -0500
Subject: [PATCH 074/150] Updates to goclean.sh

Switch to use gometalinter like the other projects.

Only list static checkers we actually use.

Fix ordering of imports caught by goimports.

Closes #128
---
 .travis.yml          |  6 ++----
 cudakernel_static.go |  3 ++-
 goclean.sh           | 19 +++++++++++--------
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index f050815..000b57d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,10 +10,8 @@ before_install:
 install:
   - go get -v github.com/Masterminds/glide
   - glide install
-  - go get -v golang.org/x/tools/cmd/cover
-  - go get -v github.com/bradfitz/goimports
-  - go get -v github.com/golang/lint/golint
-  - go get -v github.com/davecgh/go-spew/spew
+  - go get -v github.com/alecthomas/gometalinter
+  - gometalinter --install
 script:
   - export PATH=$PATH:$HOME/gopath/bin
   - ./goclean.sh
diff --git a/cudakernel_static.go b/cudakernel_static.go
index 9d716a5..81f5203 100644
--- a/cudakernel_static.go
+++ b/cudakernel_static.go
@@ -9,8 +9,9 @@ package main
 */
 import "C"
 import (
-	"github.com/jcvernaleo/3/cuda/cu"
 	"unsafe"
+
+	"github.com/jcvernaleo/3/cuda/cu"
 )
 
 func cudaPrecomputeTable(input *[192]byte) {
diff --git a/goclean.sh b/goclean.sh
index 2d0a89b..c501dcc 100755
--- a/goclean.sh
+++ b/goclean.sh
@@ -1,17 +1,20 @@
 #!/bin/bash
 # The script does automatic checking on a Go package and its sub-packages, including:
 # 1. gofmt         (http://golang.org/cmd/gofmt/)
-# 2. golint        (https://github.com/golang/lint)
-# 3. go vet        (http://golang.org/cmd/vet)
-# 4. race detector (http://blog.golang.org/race-detector)
-# 5. test coverage (http://blog.golang.org/cover)
+# 2. go vet        (http://golang.org/cmd/vet)
+# 3. goimports     (https://github.com/bradfitz/goimports)
+
+# gometalinter (github.com/alecthomas/gometalinter) is used to run each
+# static checker.
 
 set -ex
 
 # Automatic checks
+test -z "$(gometalinter --disable-all \
+--enable=gofmt \
+--enable=vet \
+--enable=goimports \
+--deadline=45s $(glide novendor) | tee /dev/stderr)"
 test -z "$(go fmt $(glide novendor) | tee /dev/stderr)"
-# TODO
-#test -z "$(for package in $(glide novendor); do golint $package; done | grep -v 'ALL_CAPS\|OP_\|NewFieldVal' | tee /dev/stderr)"
 test -z "$(go vet $(glide novendor) 2>&1 | tee /dev/stderr)"
-# TODO
-#env GORACE="halt_on_error=1" go test -v -race $(glide novendor)
+

From 64044f254e42c5efe4dd0f51d5b87c3b4509c500 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 16 Dec 2016 13:26:21 -0500
Subject: [PATCH 075/150] Bump for v0.7.0

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index 9dbc512..27a2ddc 100644
--- a/version.go
+++ b/version.go
@@ -31,7 +31,7 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 6
+	appMinor uint = 7
 	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet

From d7c6fdd0c34e02c7a5a015c15120dbac4ada0b01 Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Mon, 2 Jan 2017 10:28:44 -0600
Subject: [PATCH 076/150] only initialize device libraries once (#132)

---
 adl/adl.c      | 43 +++++++++++++++++++++++++++++++++----------
 adl/adl.go     | 10 ++++++++++
 cladldevice.go |  5 +++++
 cudevice.go    | 14 ++++++++------
 device.go      |  1 +
 5 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/adl/adl.c b/adl/adl.c
index 65224b2..4db72c9 100644
--- a/adl/adl.c
+++ b/adl/adl.c
@@ -16,11 +16,16 @@
 
 #define MAX_GPUDEVICES 16
 
+static int iNumberAdapters;
+static LPAdapterInfo lpInfo = NULL;
+static bool adl_active = 0;
+
 // declarations in adl_functions.h for these are formatted for dynamic loading
 int ADL_Adapter_AdapterInfo_Get(LPAdapterInfo lpInfo, int iInputSize);
 int ADL_Adapter_ID_Get(int iAdapterIndex, int *lpAdapterID);
 int ADL_Adapter_NumberOfAdapters_Get(int *lpNumAdapters);
 int ADL_Main_Control_Create(ADL_MAIN_MALLOC_CALLBACK callback, int iEnumConnectedAdapters);
+int ADL_Main_Control_Destroy();
 int ADL_Overdrive5_FanSpeed_Get(int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
 int ADL_Overdrive5_FanSpeed_Set(int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
 int ADL_Overdrive5_FanSpeedToDefault_Set(int iAdapaterIndex, int iThermalControllerIndex);
@@ -52,22 +57,17 @@ static void __stdcall ADL_Main_Memory_Free (void **lpBuffer)
   }
 }
 
-int doADLCommand(int deviceid, char field[64], int arg) {
-  int result, i, j, devices = 0, last_adapter = -1, gpu = 0, dummy = 0;
-  int iNumberAdapters;
-  struct gpu_adapters adapters[MAX_GPUDEVICES], vadapters[MAX_GPUDEVICES];
-  bool devs_match = true;
-  ADLBiosInfo BiosInfo;
-  LPAdapterInfo lpInfo = NULL;
+void init_adl() {
+  int result;
 
   if (ADL_OK != ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1)) {
-    return 0;
+    return;
   }
 
   // Obtain the number of adapters for the system
   result = ADL_Adapter_NumberOfAdapters_Get(&iNumberAdapters);
   if (result != ADL_OK) {
-    return 0;
+    return;
   }
 
   if (iNumberAdapters > 0) {
@@ -78,9 +78,32 @@ int doADLCommand(int deviceid, char field[64], int arg) {
     // Get the AdapterInfo structure for all adapters in the system
     result = ADL_Adapter_AdapterInfo_Get (lpInfo, sizeof (AdapterInfo) * iNumberAdapters);
     if (result != ADL_OK) {
-      return 0;
+      return;
     }
   } else {
+    return;
+  }
+
+  /* Flag adl as active if any card is successfully activated */
+  adl_active = true;
+
+  return;
+}
+
+void free_adl(void)
+{
+  adl_active = false;
+  ADL_Main_Memory_Free((void **)&lpInfo);
+  ADL_Main_Control_Destroy();
+}
+
+int doADLCommand(int deviceid, char field[64], int arg) {
+  int result, i, j, devices = 0, last_adapter = -1, gpu = 0, dummy = 0;
+  struct gpu_adapters adapters[MAX_GPUDEVICES], vadapters[MAX_GPUDEVICES];
+  bool devs_match = true;
+  ADLBiosInfo BiosInfo;
+
+  if (!adl_active) {
     return 0;
   }
 
diff --git a/adl/adl.go b/adl/adl.go
index 52ddd83..e40cb70 100644
--- a/adl/adl.go
+++ b/adl/adl.go
@@ -8,6 +8,8 @@ package adl
 #include <stddef.h>
 #include <stdbool.h>
 #include <adl_sdk.h>
+void init_adl();
+void free_adl();
 int getADLFanPercent(int deviceid);
 int getADLTemp(int deviceid);
 int setADLFanAutoManage(int deviceid);
@@ -15,6 +17,14 @@ int setADLFanPercent(int deviceid, int fanPercent);
 */
 import "C"
 
+func Init() {
+	C.init_adl()
+}
+
+func Release() {
+	C.free_adl()
+}
+
 // DeviceFanGetPercent fetches and returns fan utilization for a device index
 func DeviceFanGetPercent(index int) uint32 {
 	fanPercent := uint32(0)
diff --git a/cladldevice.go b/cladldevice.go
index 791efe9..1218d14 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -406,6 +406,10 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 
 	switch d.kind {
 	case DeviceKindADL:
+		if !deviceLibraryInitialized {
+			adl.Init()
+			deviceLibraryInitialized = true
+		}
 		fanPercent, temperature := deviceStats(d.index)
 		// Newer cards will idle with the fan off so just check if we got
 		// a good temperature reading
@@ -662,4 +666,5 @@ func (d *Device) Release() {
 	cl.CLReleaseMemObject(d.outputBuffer)
 	cl.CLReleaseContext(d.context)
 	adl.DeviceFanAutoManage(d.index)
+	adl.Release()
 }
diff --git a/cudevice.go b/cudevice.go
index 6a749a9..8192e57 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -107,12 +107,6 @@ func deviceStats(index int) (uint32, uint32) {
 	fanPercent := uint32(0)
 	temperature := uint32(0)
 
-	err := nvml.Init()
-	if err != nil {
-		minrLog.Errorf("NVML Init error: %v", err)
-		return fanPercent, temperature
-	}
-
 	dh, err := nvml.DeviceGetHandleByIndex(index)
 	if err != nil {
 		minrLog.Errorf("NVML DeviceGetHandleByIndex error: %v", err)
@@ -220,6 +214,14 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 
 	d.cuInSize = 21
 
+	if !deviceLibraryInitialized {
+		err := nvml.Init()
+		if err != nil {
+			minrLog.Errorf("NVML Init error: %v", err)
+		} else {
+			deviceLibraryInitialized = true
+		}
+	}
 	fanPercent, temperature := deviceStats(d.index)
 	// Newer cards will idle with the fan off so just check if we got
 	// a good temperature reading
diff --git a/device.go b/device.go
index 1e1e47c..c63557a 100644
--- a/device.go
+++ b/device.go
@@ -18,6 +18,7 @@ import (
 )
 
 var chainParams = &chaincfg.MainNetParams
+var deviceLibraryInitialized = false
 
 // Constants for fan and temperature bits
 const (

From c86f3976c92777c80a1497b32905d5f84555758b Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Thu, 2 Feb 2017 13:40:52 -0600
Subject: [PATCH 077/150] give an example for both path types (#136)

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5cfda06..0168b4b 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,8 @@ go build -tags opencladl
 - go get github.com/decred/gominer
   * Compilation will most likely fail which can be safely ignored for now.
 - Change to the gominer directory
-  * ```cd $GOPATH/src/github.com/decred/gominer```
+  * If using the Windows Command Prompt:
+  ```cd %GOPATH%/src/github.com/decred/gominer``` or if using git-bash ```cd $GOPATH%/src/github.com/decred/gominer```
 - Install dependencies via glide
   * ```glide install```
 

From 38c22ec27e11932bb6723d5e2ad3cb762a0c05e8 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 2 Feb 2017 17:02:18 -0500
Subject: [PATCH 078/150] Switch to standalone upstream CUDA libs.

Update all deps.

Update hash function names from dcrd.

Closes #133
---
 cudakernel_static.go  |  2 +-
 cudakernel_windows.go |  2 +-
 cudevice.go           |  2 +-
 device.go             |  4 ++--
 glide.lock            | 37 ++++++++++++++++++++-----------------
 glide.yaml            |  4 ++--
 stratum/stratum.go    |  2 +-
 7 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/cudakernel_static.go b/cudakernel_static.go
index 81f5203..f34eb71 100644
--- a/cudakernel_static.go
+++ b/cudakernel_static.go
@@ -11,7 +11,7 @@ import "C"
 import (
 	"unsafe"
 
-	"github.com/jcvernaleo/3/cuda/cu"
+	"github.com/barnex/cuda5/cu"
 )
 
 func cudaPrecomputeTable(input *[192]byte) {
diff --git a/cudakernel_windows.go b/cudakernel_windows.go
index 4f28cb9..36af76b 100644
--- a/cudakernel_windows.go
+++ b/cudakernel_windows.go
@@ -7,7 +7,7 @@ import (
 	"syscall"
 	"unsafe"
 
-	"github.com/jcvernaleo/3/cuda/cu"
+	"github.com/barnex/cuda5/cu"
 )
 
 var (
diff --git a/cudevice.go b/cudevice.go
index 8192e57..f40ea53 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -19,7 +19,7 @@ import (
 	"time"
 	"unsafe"
 
-	"github.com/jcvernaleo/3/cuda/cu"
+	"github.com/barnex/cuda5/cu"
 
 	"github.com/decred/gominer/nvml"
 	"github.com/decred/gominer/util"
diff --git a/device.go b/device.go
index c63557a..1d89fe9 100644
--- a/device.go
+++ b/device.go
@@ -275,11 +275,11 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	binary.BigEndian.PutUint32(data[128+4*work.TimestampWord:], ts)
 	binary.BigEndian.PutUint32(data[128+4*work.Nonce0Word:], nonce0)
 	binary.BigEndian.PutUint32(data[128+4*work.Nonce1Word:], nonce1)
-	hash := chainhash.HashFuncH(data[0:180])
+	hash := chainhash.HashH(data[0:180])
 
 	// Hashes that reach this logic and fail the minimal proof of
 	// work check are considered to be hardware errors.
-	hashNum := blockchain.ShaHashToBig(&hash)
+	hashNum := blockchain.HashToBig(&hash)
 	if hashNum.Cmp(chainParams.PowLimit) > 0 {
 		minrLog.Errorf("DEV #%d Hardware error found, hash %v above "+
 			"minimum target %064x", d.index, hash, d.work.Target.Bytes())
diff --git a/glide.lock b/glide.lock
index b60752d..274e5dc 100644
--- a/glide.lock
+++ b/glide.lock
@@ -1,14 +1,16 @@
-hash: a629ff7ac31d9554df88c93ed389d830fda2bb73a5d2c53b73d362dac0538e93
-updated: 2016-09-28T14:47:51.839078623-05:00
+hash: 599929d61b32132fd7d8f7df51315cf6ecf338f11164419da1038a38415729db
+updated: 2017-02-02T17:01:07.428979394-05:00
 imports:
+- name: github.com/barnex/cuda5
+  version: 57cec7ab46da74b8ca2aa1d41e898c0646081b56
+  subpackages:
+  - cu
 - name: github.com/btcsuite/btclog
-  version: f96df2375f37300305f329b8e5258764b4f19a7f
-- name: github.com/btcsuite/fastsha256
-  version: 302ad4db268b46f9ebda3078f6f7397f96047735
+  version: 73889fb79bd687870312b6e40effcecffbd57d30
 - name: github.com/btcsuite/go-flags
   version: 6c288d648c1cc1befcb90cb5511dcacf64ae8e61
 - name: github.com/btcsuite/go-socks
-  version: cfe8b59e565c1a5bd4e2005d77cd9aa8b2e14524
+  version: 4720035b7bfd2a9bb130b1c184f8bbe41b6f0d0f
   subpackages:
   - socks
 - name: github.com/btcsuite/golangcrypto
@@ -18,35 +20,36 @@ imports:
 - name: github.com/btcsuite/seelog
   version: 313961b101eb55f65ae0f03ddd4e322731763b6c
 - name: github.com/davecgh/go-spew
-  version: 5215b55f46b2b919f50a1df0eaa5886afe4e3b3d
+  version: 346938d642f2ec3594ed81d874461961cd0faa76
   subpackages:
   - spew
 - name: github.com/decred/blake256
   version: a840e32d7c31fe2e0218607334cb120a683951a4
 - name: github.com/decred/dcrd
-  version: 83110a26ab1c9c7caa2bcdac9d4e1c5fc3192e1d
+  version: 0b13650eb3178be7241814b2360200012258b411
   subpackages:
   - blockchain
+  - blockchain/internal/dbnamespace
+  - blockchain/internal/progresslog
+  - blockchain/stake
+  - blockchain/stake/internal/dbnamespace
+  - blockchain/stake/internal/ticketdb
+  - blockchain/stake/internal/tickettreap
   - chaincfg
+  - chaincfg/chainec
   - chaincfg/chainhash
-  - wire
-  - blockchain/stake
   - database
-  - txscript
-  - chaincfg/chainec
   - dcrec/edwards
   - dcrec/secp256k1
   - dcrec/secp256k1/schnorr
+  - txscript
+  - wire
 - name: github.com/decred/dcrutil
-  version: 4a3bdb1cb08b49811674750998363b8b8ccfd66e
+  version: ba0a5f399a43abc0e1a0a0442509c36f35321bce
   subpackages:
   - base58
 - name: github.com/decred/ed25519
   version: b0909d3f798b97a03c9e77023f97a5301a2a7900
   subpackages:
   - edwards25519
-- name: github.com/jcvernaleo/3
-  version: ff7ea4431a97bb2b389739cb9087bbc11faa1fe0
-  subpackages:
-  - cuda/cu
 testImports: []
diff --git a/glide.yaml b/glide.yaml
index b454f1a..4d2f0cf 100644
--- a/glide.yaml
+++ b/glide.yaml
@@ -16,6 +16,6 @@ import:
   - chaincfg/chainhash
   - wire
 - package: github.com/decred/dcrutil
-- package: github.com/jcvernaleo/3
+- package: github.com/barnex/cuda5
   subpackages:
-  - cuda/cu
+  - cu
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 53e47d8..fc68025 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -948,7 +948,7 @@ func (s *Stratum) PrepWork() error {
 func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
 	log.Debugf("Stratum got valid work to submit %x", data)
 	log.Debugf("Stratum got valid work hash %v",
-		chainhash.HashFuncH(data[0:180]))
+		chainhash.HashH(data[0:180]))
 	data2 := make([]byte, 180)
 	copy(data2, data[0:180])
 

From e967abd8ecae4b05f5fa45b8fce602008cc94a0b Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 7 Feb 2017 11:19:48 -0500
Subject: [PATCH 079/150] Bump for v0.8.0

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index 27a2ddc..072b96f 100644
--- a/version.go
+++ b/version.go
@@ -31,7 +31,7 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // versioning 2.0.0 spec (http://semver.org/).
 const (
 	appMajor uint = 0
-	appMinor uint = 7
+	appMinor uint = 8
 	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet

From 28416195c42af7ecf113b4692aaf0f9d47d8338e Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Fri, 17 Feb 2017 11:19:21 -0500
Subject: [PATCH 080/150] Handle non-int pool difficulties better.

Take care of diff between 0 and 1
---
 util/util.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/util/util.go b/util/util.go
index 6217f54..9d389bc 100644
--- a/util/util.go
+++ b/util/util.go
@@ -63,11 +63,13 @@ func DiffToTarget(diff float64, powLimit *big.Int) (*big.Int, error) {
 			"zero passed)", diff)
 	}
 
-	if math.Floor(diff) < diff {
-		return nil, fmt.Errorf("invalid pool difficulty %v (not a whole "+
-			"number)", diff)
+	// Round down in the case of a non-integer diff since we only support
+	// ints (unless diff < 1 since we don't allow 0)..
+	if diff < 1 {
+		diff = 1
+	} else {
+		diff = math.Floor(diff)
 	}
-
 	divisor := new(big.Int).SetInt64(int64(diff))
 	max := powLimit
 	target := new(big.Int)

From 945e224a80ad806654c6d4c184e1c7d635048186 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Mon, 20 Feb 2017 12:01:51 -0500
Subject: [PATCH 081/150] Add check on json to prevent panic.

Make sure the unmarshaled JSON is not zero length so we don't panic
when trying to read it.
---
 stratum/stratum.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/stratum/stratum.go b/stratum/stratum.go
index fc68025..c45e55e 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -578,6 +578,10 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 			return nil, err
 		}
 
+		if len(resJS) == 0 {
+			return nil, errJsonType
+		}
+
 		var msgPeak []interface{}
 		err = json.Unmarshal(resJS[0], &msgPeak)
 		if err != nil {

From 2ac9e7af1a968a9dc99902e4e119f92add5c24e6 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Mon, 20 Feb 2017 12:07:23 -0500
Subject: [PATCH 082/150] Differentiate dev and release in -V

By default, put 'dev' in the prerelease field.

For release builds we can add 'release' with the build scripts.

Update comment for current (post 1.5) usage on ldflags -X while
there.
---
 version.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/version.go b/version.go
index 072b96f..3a4d8f0 100644
--- a/version.go
+++ b/version.go
@@ -40,9 +40,9 @@ const (
 )
 
 // appBuild is defined as a variable so it can be overridden during the build
-// process with '-ldflags "-X main.appBuild foo' if needed.  It MUST only
+// process with '-ldflags "-X main.appBuild=foo' if needed.  It MUST only
 // contain characters from semanticAlphabet per the semantic versioning spec.
-var appBuild string
+var appBuild = "dev"
 
 // version returns the application version as a properly formed string per the
 // semantic versioning 2.0.0 spec (http://semver.org/).

From 0f039e3f1bdeda77e360a4e0330103cca1f9e6db Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 14 Mar 2017 09:20:11 -0400
Subject: [PATCH 083/150] Add go version to version info

Following example of dcrd and dcrwallet.
---
 config.go | 3 ++-
 main.go   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/config.go b/config.go
index 242736a..283f04d 100644
--- a/config.go
+++ b/config.go
@@ -8,6 +8,7 @@ import (
 	"net"
 	"os"
 	"path/filepath"
+	"runtime"
 	"sort"
 	"strconv"
 	"strings"
@@ -271,7 +272,7 @@ func loadConfig() (*config, []string, error) {
 	}
 
 	if preCfg.ShowVersion {
-		fmt.Println(appName, gpuLib(), "version", version())
+		fmt.Printf("%s %s version %s (Go version %s)\n", appName, gpuLib(), version(), runtime.Version())
 		os.Exit(0)
 	}
 
diff --git a/main.go b/main.go
index 35773cd..a65537a 100644
--- a/main.go
+++ b/main.go
@@ -25,7 +25,8 @@ func gominerMain() error {
 	defer backendLog.Flush()
 
 	// Show version at startup.
-	mainLog.Infof("Version %s %s", version(), gpuLib())
+	mainLog.Infof("Version %s %s (Go version %s)",
+		version(), gpuLib(), runtime.Version())
 
 	// Enable http profiling server if requested.
 	if cfg.Profile != "" {

From e7b976b1ea96f88fc59a03f652367e60d512c9c9 Mon Sep 17 00:00:00 2001
From: Baggins800 <23511354@student.g.nwu.ac.za>
Date: Fri, 24 Mar 2017 16:38:45 +0200
Subject: [PATCH 084/150] Some file location updates for Linux (#147)

---
 GNUmakefile | 6 +++---
 decred.cu   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index 2224e12..1469e55 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,6 +1,6 @@
-CC ?= gcc
-CXX ?= g++
-NVCC ?= nvcc
+CC ?= gcc -fPIC
+CXX ?= g++ -fPIC
+NVCC ?= nvcc -Xcompiler -fPIC
 AR ?= ar
 # -o is gnu only so this needs to be smarter; it does work because on darwin it
 #  fails which is also not windows.
diff --git a/decred.cu b/decred.cu
index 19a123a..d855f3b 100644
--- a/decred.cu
+++ b/decred.cu
@@ -12,7 +12,7 @@
 
 #include <stdint.h>
 #include <memory.h>
-#include <miner.h>
+#include "miner.h"
 
 #if defined(_WIN32)
 #define DLLEXPORT __declspec(dllexport)
@@ -21,7 +21,7 @@
 #endif /* _WIN32 */
 
 extern "C" {
-#include <sph/sph_blake.h>
+#include "sph/sph_blake.h"
 }
 
 /* threads per block */
@@ -42,7 +42,7 @@ extern "C" void decred_hash(void *output, const void *input)
 	sph_blake256_close(&ctx, output);
 }
 
-#include <cuda_helper.h>
+#include "cuda_helper.h"
 
 #ifdef __INTELLISENSE__
 #define __byte_perm(x, y, b) x

From 6b03e73eaf9904026bbcfafa15e18f8fa2832275 Mon Sep 17 00:00:00 2001
From: tim <timthomascode@icloud.com>
Date: Wed, 29 Mar 2017 21:27:24 -0500
Subject: [PATCH 085/150] fix GOPATH typo (#150)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0168b4b..f639985 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ go build -tags opencladl
   * Compilation will most likely fail which can be safely ignored for now.
 - Change to the gominer directory
   * If using the Windows Command Prompt:
-  ```cd %GOPATH%/src/github.com/decred/gominer``` or if using git-bash ```cd $GOPATH%/src/github.com/decred/gominer```
+  ```cd %GOPATH%/src/github.com/decred/gominer``` or if using git-bash ```cd $GOPATH/src/github.com/decred/gominer```
 - Install dependencies via glide
   * ```glide install```
 

From 4613a6e44f476c9a7ba7af8a03f389b0609003fc Mon Sep 17 00:00:00 2001
From: jolan <jolan@users.noreply.github.com>
Date: Mon, 24 Apr 2017 09:32:06 -0500
Subject: [PATCH 086/150] stratum: include stake version (#153)

---
 .gitignore         |  1 +
 stratum/stratum.go | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8951dcd..6fa35c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 *.so
 
 # Folders
+.vscode/
 _obj
 _test
 
diff --git a/stratum/stratum.go b/stratum/stratum.go
index c45e55e..5ded648 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -437,7 +437,7 @@ func (s *Stratum) Auth() error {
 		Params: []string{s.cfg.User, s.cfg.Pass},
 	}
 	// Auth reply has no method so need a way to identify it.
-	// Ugly, but not much choise.
+	// Ugly, but not much choice.
 	id, ok := msg.ID.(uint64)
 	if !ok {
 		return errJsonType
@@ -851,8 +851,11 @@ func (s *Stratum) PrepWork() error {
 		log.Error("Error decoding Coinbase pt 1.")
 		return err
 	}
-
-	// cb2 is never actually sent, so don't try to decode it.
+	cb2, err := hex.DecodeString(s.PoolWork.CB2)
+	if err != nil {
+		log.Errorf("Error decoding Coinbase pt 2.")
+		return err
+	}
 
 	// Generate current ntime.
 	ntime := time.Now().Unix() + s.PoolWork.NtimeDelta
@@ -911,6 +914,8 @@ func (s *Stratum) PrepWork() error {
 	copy(workdata[workPosition:], cb1[0:108])
 	workPosition += 108
 	copy(workdata[workPosition:], extraNonce)
+	workPosition = 176
+	copy(workdata[workPosition:], cb2)
 
 	var randomBytes = make([]byte, 4)
 	_, err = rand.Read(randomBytes)

From d2503a9d0d3533cbceac970414f6f7f457faceb3 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Mon, 24 Apr 2017 09:01:48 -0400
Subject: [PATCH 087/150] Bump for v1.0.0

---
 version.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/version.go b/version.go
index 3a4d8f0..c9c7b64 100644
--- a/version.go
+++ b/version.go
@@ -30,8 +30,8 @@ const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr
 // These constants define the application version and follow the semantic
 // versioning 2.0.0 spec (http://semver.org/).
 const (
-	appMajor uint = 0
-	appMinor uint = 8
+	appMajor uint = 1
+	appMinor uint = 0
 	appPatch uint = 0
 
 	// appPreRelease MUST only contain characters from semanticAlphabet

From d01e19d2c15d5e3fb7958c0cbef0d606d2e1ce0b Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Thu, 8 Jun 2017 14:51:31 -0400
Subject: [PATCH 088/150] travis: test against go 1.7 and 1.8

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 000b57d..39ad1a5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 language: go
 go:
-  - 1.6.3
-  - 1.7.1
+  - 1.7.x
+  - 1.8.x
 sudo: required
 dist: trusty
 before_install:

From 38778b2c22587abd9f90409ef8114b9480e48a65 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Tue, 20 Jun 2017 13:53:49 -0400
Subject: [PATCH 089/150] Switch to new logging backend.

While there, update deps.
---
 config.go  |   6 +--
 glide.lock |  28 +++++++------
 glide.yaml |   1 -
 log.go     | 117 ++++++++++++++++++++++++++++-------------------------
 main.go    |   7 +++-
 5 files changed, 85 insertions(+), 74 deletions(-)

diff --git a/config.go b/config.go
index 283f04d..638b84d 100644
--- a/config.go
+++ b/config.go
@@ -549,9 +549,9 @@ func loadConfig() (*config, []string, error) {
 		os.Exit(0)
 	}
 
-	// Initialize logging at the default logging level.
-	initSeelogLogger(filepath.Join(cfg.LogDir, defaultLogFilename))
-	setLogLevels(defaultLogLevel)
+	// Initialize log rotation.  After log rotation has been initialized,
+	// the logger variables may be used.
+	initLogRotator(filepath.Join(cfg.LogDir, defaultLogFilename))
 
 	// Parse, validate, and set debug log level(s).
 	if err := parseAndSetDebugLevels(cfg.DebugLevel); err != nil {
diff --git a/glide.lock b/glide.lock
index 274e5dc..ef7db1d 100644
--- a/glide.lock
+++ b/glide.lock
@@ -1,24 +1,22 @@
 hash: 599929d61b32132fd7d8f7df51315cf6ecf338f11164419da1038a38415729db
-updated: 2017-02-02T17:01:07.428979394-05:00
+updated: 2017-06-20T13:48:20.402787267-04:00
 imports:
+- name: github.com/agl/ed25519
+  version: 278e1ec8e8a6e017cd07577924d6766039146ced
+  subpackages:
+  - edwards25519
 - name: github.com/barnex/cuda5
   version: 57cec7ab46da74b8ca2aa1d41e898c0646081b56
   subpackages:
   - cu
 - name: github.com/btcsuite/btclog
-  version: 73889fb79bd687870312b6e40effcecffbd57d30
+  version: 30bef3d5a6b4600e2129de8b6527ffcc1ee397ca
 - name: github.com/btcsuite/go-flags
   version: 6c288d648c1cc1befcb90cb5511dcacf64ae8e61
 - name: github.com/btcsuite/go-socks
   version: 4720035b7bfd2a9bb130b1c184f8bbe41b6f0d0f
   subpackages:
   - socks
-- name: github.com/btcsuite/golangcrypto
-  version: 53f62d9b43e87a6c56975cf862af7edf33a8d0df
-  subpackages:
-  - ripemd160
-- name: github.com/btcsuite/seelog
-  version: 313961b101eb55f65ae0f03ddd4e322731763b6c
 - name: github.com/davecgh/go-spew
   version: 346938d642f2ec3594ed81d874461961cd0faa76
   subpackages:
@@ -26,7 +24,7 @@ imports:
 - name: github.com/decred/blake256
   version: a840e32d7c31fe2e0218607334cb120a683951a4
 - name: github.com/decred/dcrd
-  version: 0b13650eb3178be7241814b2360200012258b411
+  version: ce4b77d3d9e3a4d3393e1fa3baeee5679cad518d
   subpackages:
   - blockchain
   - blockchain/internal/dbnamespace
@@ -45,11 +43,15 @@ imports:
   - txscript
   - wire
 - name: github.com/decred/dcrutil
-  version: ba0a5f399a43abc0e1a0a0442509c36f35321bce
+  version: a5fab53cab39b793142c8453caa4c6f83bc152d4
   subpackages:
   - base58
-- name: github.com/decred/ed25519
-  version: b0909d3f798b97a03c9e77023f97a5301a2a7900
+- name: github.com/jrick/logrotate
+  version: 4ed05ed86ef17d10ff99cce77481e0fcf6f2c7b0
   subpackages:
-  - edwards25519
+  - rotator
+- name: golang.org/x/crypto
+  version: adbae1b6b6fb4b02448a0fc0dbbc9ba2b95b294d
+  subpackages:
+  - ripemd160
 testImports: []
diff --git a/glide.yaml b/glide.yaml
index 4d2f0cf..c2935a7 100644
--- a/glide.yaml
+++ b/glide.yaml
@@ -5,7 +5,6 @@ import:
 - package: github.com/btcsuite/go-socks
   subpackages:
   - socks
-- package: github.com/btcsuite/seelog
 - package: github.com/davecgh/go-spew
   subpackages:
   - spew
diff --git a/log.go b/log.go
index 588dc21..c0d03bd 100644
--- a/log.go
+++ b/log.go
@@ -2,69 +2,85 @@ package main
 
 import (
 	"fmt"
+	"io"
 	"os"
-
-	"github.com/btcsuite/btclog"
-	"github.com/btcsuite/seelog"
+	"path/filepath"
 
 	"github.com/decred/gominer/stratum"
+
+	"github.com/btcsuite/btclog"
+	"github.com/jrick/logrotate/rotator"
 )
 
+// logWriter implements an io.Writer that outputs to both standard output and
+// the write-end pipe of an initialized log rotator.
+type logWriter struct{}
+
+func (logWriter) Write(p []byte) (n int, err error) {
+	os.Stdout.Write(p)
+	logRotatorPipe.Write(p)
+	return len(p), nil
+}
+
+// Loggers per subsystem.  A single backend logger is created and all subsytem
+// loggers created from it will write to the backend.  When adding new
+// subsystems, add the subsystem logger variable here and to the
+// subsystemLoggers map.
+//
+// Loggers can not be used before the log rotator has been initialized with a
+// log file.  This must be performed early during application startup by calling
+// initLogRotator.
 var (
-	backendLog = seelog.Disabled
-	mainLog    = btclog.Disabled
-	minrLog    = btclog.Disabled
-	poolLog    = btclog.Disabled
+	// backendLog is the logging backend used to create all subsystem loggers.
+	// The backend must not be used before the log rotator has been initialized,
+	// or data races and/or nil pointer dereferences will occur.
+	backendLog = btclog.NewBackend(logWriter{})
+
+	// logRotator is one of the logging outputs.  It should be closed on
+	// application shutdown.
+	logRotator *rotator.Rotator
+
+	// logRotatorPipe is the write-end pipe for writing to the log rotator.  It
+	// is written to by the Write method of the logWriter type.
+	logRotatorPipe *io.PipeWriter
+
+	mainLog = backendLog.Logger("MAIN")
+	minrLog = backendLog.Logger("MINR")
+	poolLog = backendLog.Logger("POOL")
 )
 
+// Initialize package-global logger variables.
+func init() {
+	stratum.UseLogger(poolLog)
+}
+
 var subsystemLoggers = map[string]btclog.Logger{
 	"MAIN": mainLog,
 	"MINR": minrLog,
 	"POOL": poolLog,
 }
 
-// useLogger updates the logger references for subsystemID to logger.  Invalid
-// subsystems are ignored.
-func useLogger(subsystemID string, logger btclog.Logger) {
-	if _, ok := subsystemLoggers[subsystemID]; !ok {
-		return
-	}
-	subsystemLoggers[subsystemID] = logger
-
-	switch subsystemID {
-	case "MAIN":
-		mainLog = logger
-	case "MINR":
-		minrLog = logger
-	case "POOL":
-		poolLog = logger
-		stratum.UseLogger(logger)
+// initLogRotator initializes the logging rotater to write logs to logFile and
+// create roll files in the same directory.  It must be called before the
+// package-global log rotater variables are used.
+func initLogRotator(logFile string) {
+	logDir, _ := filepath.Split(logFile)
+	err := os.MkdirAll(logDir, 0700)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "failed to create log directory: %v\n", err)
+		os.Exit(1)
 	}
-}
-
-// initSeelogLogger initializes a new seelog logger that is used as the backend
-// for all logging subsytems.
-func initSeelogLogger(logFile string) {
-	config := `
-	<seelog type="adaptive" mininterval="2000000" maxinterval="100000000"
-		critmsgcount="500" minlevel="trace">
-		<outputs formatid="all">
-			<console />
-			<rollingfile type="size" filename="%s" maxsize="10485760" maxrolls="3" />
-		</outputs>
-		<formats>
-			<format id="all" format="%%Time %%Date [%%LEV] %%Msg%%n" />
-		</formats>
-	</seelog>`
-	config = fmt.Sprintf(config, logFile)
-
-	logger, err := seelog.LoggerFromConfigAsString(config)
+	pr, pw := io.Pipe()
+	r, err := rotator.New(pr, logFile, 10*1024, false, 3)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "failed to create logger: %v", err)
+		fmt.Fprintf(os.Stderr, "failed to create file rotator: %v\n", err)
 		os.Exit(1)
 	}
 
-	backendLog = logger
+	go r.Run()
+
+	logRotator = r
+	logRotatorPipe = pw
 }
 
 // setLogLevel sets the logging level for provided subsystem.  Invalid
@@ -77,17 +93,8 @@ func setLogLevel(subsystemID string, logLevel string) {
 		return
 	}
 
-	// Default to info if the log level is invalid.
-	level, ok := btclog.LogLevelFromString(logLevel)
-	if !ok {
-		level = btclog.InfoLvl
-	}
-
-	// Create new logger for the subsystem if needed.
-	if logger == btclog.Disabled {
-		logger = btclog.NewSubsystemLogger(backendLog, subsystemID+": ")
-		useLogger(subsystemID, logger)
-	}
+	// Defaults to info if the log level is invalid.
+	level, _ := btclog.LevelFromString(logLevel)
 	logger.SetLevel(level)
 }
 
diff --git a/main.go b/main.go
index a65537a..572f987 100644
--- a/main.go
+++ b/main.go
@@ -22,7 +22,11 @@ func gominerMain() error {
 		return err
 	}
 	cfg = tcfg
-	defer backendLog.Flush()
+	defer func() {
+		if logRotator != nil {
+			logRotator.Close()
+		}
+	}()
 
 	// Show version at startup.
 	mainLog.Infof("Version %s %s (Go version %s)",
@@ -40,7 +44,6 @@ func gominerMain() error {
 			err := http.ListenAndServe(listenAddr, nil)
 			if err != nil {
 				mainLog.Errorf("Unable to create profiler: %v", err)
-				backendLog.Flush()
 				os.Exit(1)
 			}
 		}()

From b9636c647ea20b2fd8d4af56ccf4d82ea1eb7cd7 Mon Sep 17 00:00:00 2001
From: "John C. Vernaleo" <john@netpurgatory.com>
Date: Thu, 13 Jul 2017 09:53:56 -0400
Subject: [PATCH 090/150] Check valid log level using btclog.

This matches the change in dcrd for the same thing.
---
 config.go | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/config.go b/config.go
index 638b84d..835bd7a 100644
--- a/config.go
+++ b/config.go
@@ -14,6 +14,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/btcsuite/btclog"
 	"github.com/btcsuite/go-flags"
 	"github.com/decred/dcrutil"
 )
@@ -123,21 +124,8 @@ func fileExists(name string) bool {
 
 // validLogLevel returns whether or not logLevel is a valid debug log level.
 func validLogLevel(logLevel string) bool {
-	switch logLevel {
-	case "trace":
-		fallthrough
-	case "debug":
-		fallthrough
-	case "info":
-		fallthrough
-	case "warn":
-		fallthrough
-	case "error":
-		fallthrough
-	case "critical":
-		return true
-	}
-	return false
+	_, ok := btclog.LevelFromString(logLevel)
+	return ok
 }
 
 // supportedSubsystems returns a sorted slice of the supported subsystems for

From e7a32e7696d534e7022dc44e767e8c9bedf85ec9 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Fri, 29 Sep 2017 12:29:55 -0400
Subject: [PATCH 091/150] Drop glide, use dep. (#166)

* Drop glide, use dep.

* travis: test against go 1.9
---
 .travis.yml |  6 ++---
 Gopkg.lock  | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 Gopkg.toml  | 28 +++++++++++++++++++++
 glide.lock  | 57 ------------------------------------------
 glide.yaml  | 20 ---------------
 goclean.sh  |  7 ++----
 6 files changed, 105 insertions(+), 85 deletions(-)
 create mode 100644 Gopkg.lock
 create mode 100644 Gopkg.toml
 delete mode 100644 glide.lock
 delete mode 100644 glide.yaml

diff --git a/.travis.yml b/.travis.yml
index 39ad1a5..9d5d192 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,15 +1,15 @@
 language: go
 go:
-  - 1.7.x
   - 1.8.x
+  - 1.9.x
 sudo: required
 dist: trusty
 before_install:
   - sudo apt-get update
   - sudo apt-get install opencl-headers nvidia-opencl-dev
 install:
-  - go get -v github.com/Masterminds/glide
-  - glide install
+  - go get -v github.com/golang/dep/cmd/dep
+  - dep ensure
   - go get -v github.com/alecthomas/gometalinter
   - gometalinter --install
 script:
diff --git a/Gopkg.lock b/Gopkg.lock
new file mode 100644
index 0000000..8e57778
--- /dev/null
+++ b/Gopkg.lock
@@ -0,0 +1,72 @@
+# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
+
+
+[[projects]]
+  name = "github.com/agl/ed25519"
+  packages = [".","edwards25519"]
+  revision = "278e1ec8e8a6e017cd07577924d6766039146ced"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/barnex/cuda5"
+  packages = ["cu"]
+  revision = "57cec7ab46da74b8ca2aa1d41e898c0646081b56"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/btcsuite/btclog"
+  packages = ["."]
+  revision = "84c8d2346e9fc8c7b947e243b9c24e6df9fd206a"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/btcsuite/go-flags"
+  packages = ["."]
+  revision = "6c288d648c1cc1befcb90cb5511dcacf64ae8e61"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/btcsuite/go-socks"
+  packages = ["socks"]
+  revision = "4720035b7bfd2a9bb130b1c184f8bbe41b6f0d0f"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/davecgh/go-spew"
+  packages = ["spew"]
+  revision = "a476722483882dd40b8111f0eb64e1d7f43f56e4"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/dchest/blake256"
+  packages = ["."]
+  revision = "dee3fe6eb0e98dc774a94fc231f85baf7c29d360"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/decred/dcrd"
+  packages = ["blockchain","blockchain/internal/dbnamespace","blockchain/internal/progresslog","blockchain/stake","blockchain/stake/internal/dbnamespace","blockchain/stake/internal/ticketdb","blockchain/stake/internal/tickettreap","chaincfg","chaincfg/chainec","chaincfg/chainhash","database","dcrec/edwards","dcrec/secp256k1","dcrec/secp256k1/schnorr","txscript","wire"]
+  revision = "f11fac8134626768ef7b1f278fd4331f004bfb49"
+
+[[projects]]
+  branch = "master"
+  name = "github.com/decred/dcrutil"
+  packages = [".","base58"]
+  revision = "ddbde93f65ab0692e54ed8a5ad325fa2e8af4daa"
+
+[[projects]]
+  name = "github.com/jrick/logrotate"
+  packages = ["rotator"]
+  revision = "4ed05ed86ef17d10ff99cce77481e0fcf6f2c7b0"
+
+[[projects]]
+  name = "golang.org/x/crypto"
+  packages = ["ripemd160"]
+  revision = "adbae1b6b6fb4b02448a0fc0dbbc9ba2b95b294d"
+
+[solve-meta]
+  analyzer-name = "dep"
+  analyzer-version = 1
+  inputs-digest = "4919571b7c7b0f4116ae50ef832452943d99856b0f19a2577f89d4033e305825"
+  solver-name = "gps-cdcl"
+  solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
new file mode 100644
index 0000000..11d23a6
--- /dev/null
+++ b/Gopkg.toml
@@ -0,0 +1,28 @@
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/barnex/cuda5"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/btcsuite/btclog"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/btcsuite/go-flags"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/btcsuite/go-socks"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/davecgh/go-spew"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/decred/dcrd"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/decred/dcrutil"
diff --git a/glide.lock b/glide.lock
deleted file mode 100644
index ef7db1d..0000000
--- a/glide.lock
+++ /dev/null
@@ -1,57 +0,0 @@
-hash: 599929d61b32132fd7d8f7df51315cf6ecf338f11164419da1038a38415729db
-updated: 2017-06-20T13:48:20.402787267-04:00
-imports:
-- name: github.com/agl/ed25519
-  version: 278e1ec8e8a6e017cd07577924d6766039146ced
-  subpackages:
-  - edwards25519
-- name: github.com/barnex/cuda5
-  version: 57cec7ab46da74b8ca2aa1d41e898c0646081b56
-  subpackages:
-  - cu
-- name: github.com/btcsuite/btclog
-  version: 30bef3d5a6b4600e2129de8b6527ffcc1ee397ca
-- name: github.com/btcsuite/go-flags
-  version: 6c288d648c1cc1befcb90cb5511dcacf64ae8e61
-- name: github.com/btcsuite/go-socks
-  version: 4720035b7bfd2a9bb130b1c184f8bbe41b6f0d0f
-  subpackages:
-  - socks
-- name: github.com/davecgh/go-spew
-  version: 346938d642f2ec3594ed81d874461961cd0faa76
-  subpackages:
-  - spew
-- name: github.com/decred/blake256
-  version: a840e32d7c31fe2e0218607334cb120a683951a4
-- name: github.com/decred/dcrd
-  version: ce4b77d3d9e3a4d3393e1fa3baeee5679cad518d
-  subpackages:
-  - blockchain
-  - blockchain/internal/dbnamespace
-  - blockchain/internal/progresslog
-  - blockchain/stake
-  - blockchain/stake/internal/dbnamespace
-  - blockchain/stake/internal/ticketdb
-  - blockchain/stake/internal/tickettreap
-  - chaincfg
-  - chaincfg/chainec
-  - chaincfg/chainhash
-  - database
-  - dcrec/edwards
-  - dcrec/secp256k1
-  - dcrec/secp256k1/schnorr
-  - txscript
-  - wire
-- name: github.com/decred/dcrutil
-  version: a5fab53cab39b793142c8453caa4c6f83bc152d4
-  subpackages:
-  - base58
-- name: github.com/jrick/logrotate
-  version: 4ed05ed86ef17d10ff99cce77481e0fcf6f2c7b0
-  subpackages:
-  - rotator
-- name: golang.org/x/crypto
-  version: adbae1b6b6fb4b02448a0fc0dbbc9ba2b95b294d
-  subpackages:
-  - ripemd160
-testImports: []
diff --git a/glide.yaml b/glide.yaml
deleted file mode 100644
index c2935a7..0000000
--- a/glide.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-package: github.com/decred/gominer
-import:
-- package: github.com/btcsuite/btclog
-- package: github.com/btcsuite/go-flags
-- package: github.com/btcsuite/go-socks
-  subpackages:
-  - socks
-- package: github.com/davecgh/go-spew
-  subpackages:
-  - spew
-- package: github.com/decred/dcrd
-  subpackages:
-  - blockchain
-  - chaincfg
-  - chaincfg/chainhash
-  - wire
-- package: github.com/decred/dcrutil
-- package: github.com/barnex/cuda5
-  subpackages:
-  - cu
diff --git a/goclean.sh b/goclean.sh
index c501dcc..bca23bb 100755
--- a/goclean.sh
+++ b/goclean.sh
@@ -10,11 +10,8 @@
 set -ex
 
 # Automatic checks
-test -z "$(gometalinter --disable-all \
+test -z "$(gometalinter --vendor --disable-all \
 --enable=gofmt \
 --enable=vet \
 --enable=goimports \
---deadline=45s $(glide novendor) | tee /dev/stderr)"
-test -z "$(go fmt $(glide novendor) | tee /dev/stderr)"
-test -z "$(go vet $(glide novendor) 2>&1 | tee /dev/stderr)"
-
+--deadline=10m ./... | tee /dev/stderr)"

From 47d953e95530c1c7858084e16e4552d882d78027 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Fri, 29 Sep 2017 14:26:58 -0400
Subject: [PATCH 092/150] travis: enable ineffassign (#167)

---
 adl/adl.go         |  8 ++------
 cladldevice.go     |  7 ++-----
 cldevice.go        |  7 ++-----
 cudevice.go        |  4 ++--
 device.go          | 34 +++++++++++++++++-----------------
 goclean.sh         |  2 ++
 stratum/stratum.go |  1 -
 7 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/adl/adl.go b/adl/adl.go
index e40cb70..c0695fb 100644
--- a/adl/adl.go
+++ b/adl/adl.go
@@ -27,9 +27,7 @@ func Release() {
 
 // DeviceFanGetPercent fetches and returns fan utilization for a device index
 func DeviceFanGetPercent(index int) uint32 {
-	fanPercent := uint32(0)
-
-	fanPercent = uint32(C.getADLFanPercent(C.int(index)))
+	fanPercent := uint32(C.getADLFanPercent(C.int(index)))
 
 	return fanPercent
 }
@@ -42,9 +40,7 @@ func DeviceFanSetPercent(index int, fanPercent uint32) int {
 
 // DeviceTemperature fetches and returns temperature for a device index
 func DeviceTemperature(index int) uint32 {
-	temperature := uint32(0)
-
-	temperature = uint32(C.getADLTemp(C.int(index)))
+	temperature := uint32(C.getADLTemp(C.int(index)))
 
 	return temperature
 }
diff --git a/cladldevice.go b/cladldevice.go
index 1218d14..238826c 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -135,11 +135,8 @@ type Device struct {
 }
 
 func deviceStats(index int) (uint32, uint32) {
-	fanPercent := uint32(0)
-	temperature := uint32(0)
-
-	fanPercent = adl.DeviceFanGetPercent(index)
-	temperature = adl.DeviceTemperature(index) / AMDTempDivisor
+	fanPercent := adl.DeviceFanGetPercent(index)
+	temperature := adl.DeviceTemperature(index) / AMDTempDivisor
 
 	return fanPercent, temperature
 }
diff --git a/cldevice.go b/cldevice.go
index 6c6497e..e5e739c 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -282,13 +282,10 @@ func determineDeviceKind(index int, deviceType string) string {
 }
 
 func deviceStats(index int) (uint32, uint32) {
-	fanPercent := uint32(0)
-	temperature := uint32(0)
-
-	fanPercent = deviceStatsReadSysfsEntry(amdgpuGetSysfsPath(index, "fan"))
+	fanPercent := deviceStatsReadSysfsEntry(amdgpuGetSysfsPath(index, "fan"))
 	fanPercentFloat := float64(fanPercent) / float64(AMDGPUFanMax) * float64(100)
 	fanPercent = uint32(fanPercentFloat)
-	temperature = deviceStatsReadSysfsEntry(amdgpuGetSysfsPath(index, "temp")) / AMDTempDivisor
+	temperature := deviceStatsReadSysfsEntry(amdgpuGetSysfsPath(index, "temp")) / AMDTempDivisor
 
 	return fanPercent, temperature
 }
diff --git a/cudevice.go b/cudevice.go
index f40ea53..031b110 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -347,9 +347,9 @@ func (d *Device) runDevice() error {
 		startNonce := d.lastBlock[work.Nonce1Word]
 
 		throughput := uint32(0x20000000)
-		gridx := ((throughput - 1) / 640)
+		//gridx := ((throughput - 1) / 640)
 
-		gridx = 52428 // like ccminer
+		gridx := uint32(52428) // like ccminer
 
 		targetHigh := ^uint32(0)
 
diff --git a/device.go b/device.go
index 1d89fe9..3d04a29 100644
--- a/device.go
+++ b/device.go
@@ -103,19 +103,19 @@ func (d *Device) Run() {
 func (d *Device) fanControl() {
 	d.Lock()
 	defer d.Unlock()
-	fanChange := 0
-	fanChangeLevel := ""
-	fanIntent := ""
+	var fanChangeLevel, fanIntent string
+	var fanChange uint32
 	fanLast := d.fanControlLastFanPercent
-	tempChange := 0
-	tempChangeLevel := ""
-	tempDirection := ""
+
+	var tempChange uint32
+	var tempChangeLevel, tempDirection string
+	var tempSeverity, tempTargetType string
+
+	var firstRun bool
+
 	tempLast := d.fanControlLastTemp
 	tempMinAllowed := d.tempTarget - FanControlHysteresis
 	tempMaxAllowed := d.tempTarget + FanControlHysteresis
-	tempSeverity := ""
-	tempTargetType := ""
-	firstRun := false
 
 	// Save the values we read for the next time the loop is run
 	fanCur := atomic.LoadUint32(&d.fanPercent)
@@ -160,12 +160,12 @@ func (d *Device) fanControl() {
 
 	// we increased the fan to lower the device temperature last time
 	if fanLast < fanCur {
-		fanChange = int(fanCur) - int(fanLast)
+		fanChange = fanCur - fanLast
 		fanIntent = TargetHigher
 	}
 	// we decreased the fan to raise the device temperature last time
 	if fanLast > fanCur {
-		fanChange = int(fanLast) - int(fanCur)
+		fanChange = fanLast - fanCur
 		fanIntent = TargetLower
 	}
 	// we didn't make any changes
@@ -175,16 +175,16 @@ func (d *Device) fanControl() {
 
 	if fanChange == 0 {
 		fanChangeLevel = ChangeLevelNone
-	} else if fanChange == int(FanControlAdjustmentSmall) {
+	} else if fanChange == FanControlAdjustmentSmall {
 		fanChangeLevel = ChangeLevelSmall
-	} else if fanChange == int(FanControlAdjustmentLarge) {
+	} else if fanChange == FanControlAdjustmentLarge {
 		fanChangeLevel = ChangeLevelLarge
 	} else {
 		// XXX Seems the AMDGPU driver may not support all values or
 		// changes values underneath us
 		minrLog.Tracef("DEV #%d fan changed by an unexpected value %v", d.index,
 			fanChange)
-		if fanChange < int(FanControlAdjustmentSmall) {
+		if fanChange < FanControlAdjustmentSmall {
 			fanChangeLevel = ChangeLevelSmall
 		} else {
 			fanChangeLevel = ChangeLevelLarge
@@ -192,11 +192,11 @@ func (d *Device) fanControl() {
 	}
 
 	if tempLast < tempCur {
-		tempChange = int(tempCur) - int(tempLast)
+		tempChange = tempCur - tempLast
 		tempDirection = "Up"
 	}
 	if tempLast > tempCur {
-		tempChange = int(tempLast) - int(tempCur)
+		tempChange = tempLast - tempCur
 		tempDirection = "Down"
 	}
 	if tempLast == tempCur {
@@ -205,7 +205,7 @@ func (d *Device) fanControl() {
 
 	if tempChange == 0 {
 		tempChangeLevel = ChangeLevelNone
-	} else if tempChange > int(FanControlHysteresis) {
+	} else if tempChange > FanControlHysteresis {
 		tempChangeLevel = ChangeLevelLarge
 	} else {
 		tempChangeLevel = ChangeLevelSmall
diff --git a/goclean.sh b/goclean.sh
index bca23bb..039e004 100755
--- a/goclean.sh
+++ b/goclean.sh
@@ -3,6 +3,7 @@
 # 1. gofmt         (http://golang.org/cmd/gofmt/)
 # 2. go vet        (http://golang.org/cmd/vet)
 # 3. goimports     (https://github.com/bradfitz/goimports)
+# 4. ineffassign   (https://github.com/gordonklaus/ineffassign)
 
 # gometalinter (github.com/alecthomas/gometalinter) is used to run each each
 # static checker.
@@ -14,4 +15,5 @@ test -z "$(gometalinter --vendor --disable-all \
 --enable=gofmt \
 --enable=vet \
 --enable=goimports \
+--enable=ineffassign \
 --deadline=10m ./... | tee /dev/stderr)"
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 5ded648..bff1fdf 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -923,7 +923,6 @@ func (s *Stratum) PrepWork() error {
 		log.Errorf("Unable to generate random bytes")
 		return err
 	}
-	workPosition += 4
 
 	var workData [192]byte
 	copy(workData[:], workdata[:])

From 44ea5a25608b9b3312210ac764cd0a1eccc2edff Mon Sep 17 00:00:00 2001
From: Gabriel Oliveira <gabrielboliveira@users.noreply.github.com>
Date: Mon, 2 Oct 2017 12:53:03 -0300
Subject: [PATCH 093/150] Implement Remote Status API (#165)

---
 README.md           | 107 ++++++++++++++++++++++++++++++++-----------
 config.go           |  75 ++++++++++++++++++++++--------
 device.go           |  21 ++++++---
 main.go             |   4 ++
 miner.go            |  30 ++++++++----
 monitor.go          | 108 ++++++++++++++++++++++++++++++++++++++++++++
 sample-gominer.conf |  33 ++++++++++++++
 stratum/stratum.go  |   8 ++++
 8 files changed, 325 insertions(+), 61 deletions(-)
 create mode 100644 monitor.go

diff --git a/README.md b/README.md
index f639985..e9d484b 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,53 @@ Stratum/pool mining:
 gominer -o stratum+tcp://pool:port -m username -n password
 ```
 
-## Linux Build Pre-Requisites
+## Status API
+
+There is a built-in status API to report miner information. You can set an
+address and port with `--apilisten`. There are configuration examples in
+[sample-gominer.conf](sample-gominer.conf). If no port is specified, the API
+listens on port `3333` by default.
+
+Example usage:
+
+```sh
+$ gominer --apilisten="localhost"
+```
+
+Example output:
+
+```sh
+$ curl http://localhost:3333/
+> {
+    "validShares": 0,
+    "staleShares": 0,
+    "invalidShares": 0,
+    "totalShares": 0,
+    "sharesPerMinute": 0,
+    "started": 1504453881,
+    "uptime": 6,
+    "devices": [{
+        "index": 2,
+        "deviceName": "GeForce GT 750M",
+        "deviceType": "GPU",
+        "hashRate": 110127366.53846154,
+        "hashRateFormatted": "110MH/s",
+        "fanPercent": 0,
+        "temperature": 0,
+        "started": 1504453880
+    }],
+    "pool": {
+        "started": 1504453881,
+        "uptime": 6
+    }
+}
+```
+
+## Building
+
+### Linux
+
+#### Pre-Requisites
 
 You will either need to install CUDA for NVIDIA graphics cards or OpenCL
 library/headers that support your device such as: AMDGPU-PRO (for newer AMD
@@ -47,7 +93,7 @@ gominer has been built successfully on Ubuntu 16.04 with go1.6.2, go1.7.1,
 g++ 5.4.0, and beignet-dev 1.1.1-2 although other combinations should work as
 well.
 
-## Linux Build Instructions
+#### Instructions
 
 To download and build gominer, run:
 
@@ -75,41 +121,59 @@ For OpenCL with AMD Device Library (ADL) support:
 go build -tags opencladl
 ```
 
-## Windows Build Pre-Requisites
+### Windows
+
+#### Pre-Requisites
 
 - Download and install the official Go Windows binaries from [https://golang.dl/](https://golang.org/dl/)
 - Download and install Git for Windows from [https://git-for-windows.github.io/](https://git-for-windows.github.io/)
   * Make sure to select the Git-Bash option when prompted
 - Download the MinGW-w64 installer from [https://sourceforge.net/projects/mingw-w64/files/Toolchains targetting Win32/Personal Builds/mingw-builds/installer/](https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/)
   * Select the x64 toolchain and use defaults for the other questions
-- Set the environment variable GOPATH to C:\Users\username\go
+- Set the environment variable GOPATH to `C:\Users\username\go`
 - Check that the GOROOT environment variable is set to C:\Go
   * This should have been done by the Go installer
-- Add the following locations to your PATH C:\Users\username\go\bin;C:\Go\bin
-- Add C:\Program Files\mingw-w64\x84_64-6.2.0-posix-seh-rt_v5-rev1\mingw64\bin to your PATH (This is the latest release as of 2016-09-29)
-- go get github.com/Masterminds/glide
+- Add the following locations to your PATH: `C:\Users\username\go\bin;C:\Go\bin`
+- Add `C:\Program Files\mingw-w64\x84_64-6.2.0-posix-seh-rt_v5-rev1\mingw64\bin` to your PATH (This is the latest release as of 2016-09-29)
+- `go get github.com/Masterminds/glide`
   * You should be able to type ```glide``` and get glide's usage display.  If not, double check the steps above
-- go get github.com/decred/gominer
+- `go get github.com/decred/gominer`
   * Compilation will most likely fail which can be safely ignored for now.
 - Change to the gominer directory
   * If using the Windows Command Prompt:
-  ```cd %GOPATH%/src/github.com/decred/gominer``` or if using git-bash ```cd $GOPATH/src/github.com/decred/gominer```
+  ```cd %GOPATH%/src/github.com/decred/gominer```
+  * If using git-bash
+  ```cd $GOPATH/src/github.com/decred/gominer```
 - Install dependencies via glide
   * ```glide install```
 
-### CUDA Specific Steps
+#### Build Instructions
+
+##### CUDA
+
+###### Pre-Requisites
 
 - Download Microsoft Visual Studio 2013 from [https://www.microsoft.com/en-us/download/details.aspx?id=44914](https://www.microsoft.com/en-us/download/details.aspx?id=44914)
-- Add C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin to your PATH
+- Add `C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin` to your PATH
 - Install CUDA 7.0 from [https://developer.nvidia.com/cuda-toolkit-70](https://developer.nvidia.com/cuda-toolkit-70)
-- Add C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\bin to your PATH
+- Add `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\bin` to your PATH
+
+###### Steps
+- Using git-bash:
+  * ```cd $GOPATH/src/github.com/decred/gominer```
+  * ```mingw32-make.exe```
+- Copy dependencies:
+  * ```copy obj/decred.dll .```
+  * ```copy nvidia/NVSMI/nvml.dll .```
 
-### OpenCL/ADL Specific Steps
+##### OpenCL/ADL
+
+###### Pre-Requisites
 
 - Download AMD APP SDK v3.0 from [http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/)
   * Samples may be unselected from the install to save space as only the libraries and headers are needed
-- Copy or Move C:\Program Files (x86)\AMD APP SDK\3.0 to C:\appsdk
-  * Ensure the folders C:\appsdk\include and C:\appsdk\lib are populated
+- Copy or Move `C:\Program Files (x86)\AMD APP SDK\3.0` to `C:\appsdk`
+  * Ensure the folders `C:\appsdk\include` and `C:\appsdk\lib` are populated
 - Change to the library directory C:\appsdk\lib\x86_64
   * ```cd C:\appsdk\lib\x86_64```
 - Copy and prepare the ADL library for linking
@@ -117,18 +181,7 @@ go build -tags opencladl
   * ```gendef atiadlxx.dll```
   * ```dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def```
 
-## Windows Build Instructions
-
-### CUDA
-
-- Using git-bash:
-  * ```cd $GOPATH/src/github.com/decred/gominer```
-  * ```mingw32-make.exe```
-- Copy dependencies:
-  * ```copy obj/decred.dll .```
-  * ```copy nvidia/NVSMI/nvml.dll .```
-
-### OpenCL / OpenCL w/ADL support
+###### Steps
 
 - For OpenCL:
   * ```go build -tags opencl```
diff --git a/config.go b/config.go
index 835bd7a..de29d43 100644
--- a/config.go
+++ b/config.go
@@ -28,13 +28,18 @@ const (
 )
 
 var (
-	minerHomeDir         = dcrutil.AppDataDir("gominer", false)
-	dcrdHomeDir          = dcrutil.AppDataDir("dcrd", false)
-	defaultConfigFile    = filepath.Join(minerHomeDir, defaultConfigFilename)
-	defaultRPCServer     = "localhost"
-	defaultRPCCertFile   = filepath.Join(dcrdHomeDir, "rpc.cert")
-	defaultLogDir        = filepath.Join(minerHomeDir, defaultLogDirname)
-	defaultAutocalibrate = 500
+	minerHomeDir          = dcrutil.AppDataDir("gominer", false)
+	dcrdHomeDir           = dcrutil.AppDataDir("dcrd", false)
+	defaultConfigFile     = filepath.Join(minerHomeDir, defaultConfigFilename)
+	defaultRPCServer      = "localhost"
+	defaultRPCCertFile    = filepath.Join(dcrdHomeDir, "rpc.cert")
+	defaultRPCPortMainNet = "9109"
+	defaultRPCPortTestNet = "19109"
+	defaultRPCPortSimNet  = "19556"
+	defaultAPIHost        = "localhost"
+	defaultAPIPort        = "3333"
+	defaultLogDir         = filepath.Join(minerHomeDir, defaultLogDirname)
+	defaultAutocalibrate  = 500
 
 	minIntensity  = 8
 	maxIntensity  = 31
@@ -59,6 +64,9 @@ type config struct {
 	CPUProfile string `long:"cpuprofile" description:"Write CPU profile to the specified file"`
 	MemProfile string `long:"memprofile" description:"Write mem profile to the specified file"`
 
+	// Status API options
+	APIListeners []string `long:"apilisten" description:"Add an interface/port to expose miner status API"`
+
 	// RPC connection options
 	RPCUser     string `short:"u" long:"rpcuser" description:"RPC username"`
 	RPCPassword string `short:"P" long:"rpcpass" default-mask:"-" description:"RPC password"`
@@ -92,26 +100,40 @@ type config struct {
 	PoolPassword string `short:"n" long:"poolpass" default-mask:"-" description:"Pool password"`
 }
 
+// removeDuplicateAddresses returns a new slice with all duplicate entries in
+// addrs removed.
+func removeDuplicateAddresses(addrs []string) []string {
+	result := make([]string, 0, len(addrs))
+	seen := map[string]struct{}{}
+	for _, val := range addrs {
+		if _, ok := seen[val]; !ok {
+			result = append(result, val)
+			seen[val] = struct{}{}
+		}
+	}
+	return result
+}
+
 // normalizeAddress returns addr with the passed default port appended if
 // there is not already a port specified.
-func normalizeAddress(addr string, useTestNet, useSimNet bool) string {
+func normalizeAddress(addr string, defaultPort string) string {
 	_, _, err := net.SplitHostPort(addr)
 	if err != nil {
-		var defaultPort string
-		switch {
-		case useTestNet:
-			defaultPort = "19109"
-		case useSimNet:
-			defaultPort = "19556"
-		default:
-			defaultPort = "9109"
-		}
-
 		return net.JoinHostPort(addr, defaultPort)
 	}
 	return addr
 }
 
+// normalizeAddresses normalizes each entry of addrs in place with the given
+// default port and returns a new slice with all duplicates removed.
+func normalizeAddresses(addrs []string, defaultPort string) []string {
+	for i, addr := range addrs {
+		addrs[i] = normalizeAddress(addr, defaultPort)
+	}
+
+	return removeDuplicateAddresses(addrs)
+}
+
 // filesExists reports whether the named file or directory exists.
 func fileExists(name string) bool {
 	if _, err := os.Stat(name); err != nil {
@@ -549,13 +571,26 @@ func loadConfig() (*config, []string, error) {
 		return nil, nil, err
 	}
 
+	if len(cfg.APIListeners) != 0 {
+		cfg.APIListeners = normalizeAddresses(cfg.APIListeners, defaultAPIPort)
+	}
+
 	// Handle environment variable expansion in the RPC certificate path.
 	cfg.RPCCert = cleanAndExpandPath(cfg.RPCCert)
 
+	var defaultRPCPort string
+	switch {
+	case cfg.TestNet:
+		defaultRPCPort = defaultRPCPortTestNet
+	case cfg.SimNet:
+		defaultRPCPort = defaultRPCPortSimNet
+	default:
+		defaultRPCPort = defaultRPCPortMainNet
+	}
+
 	// Add default port to RPC server based on --testnet flag
 	// if needed.
-	cfg.RPCServer = normalizeAddress(cfg.RPCServer, cfg.TestNet,
-		cfg.SimNet)
+	cfg.RPCServer = normalizeAddress(cfg.RPCServer, defaultRPCPort)
 
 	// Warn about missing config file only after all other configuration is
 	// done.  This prevents the warning on help messages and invalid
diff --git a/device.go b/device.go
index 3d04a29..c3f2644 100644
--- a/device.go
+++ b/device.go
@@ -317,15 +317,10 @@ func (d *Device) PrintStats() {
 		return
 	}
 
-	diffOneShareHashesAvg := uint64(0x00000000FFFFFFFF)
 	d.Lock()
 	defer d.Unlock()
-	averageHashRate := (float64(diffOneShareHashesAvg) *
-		float64(d.allDiffOneShares)) /
-		float64(secondsElapsed)
 
-	fanPercent := atomic.LoadUint32(&d.fanPercent)
-	temperature := atomic.LoadUint32(&d.temperature)
+	averageHashRate, fanPercent, temperature := d.Status()
 
 	if fanPercent != 0 || temperature != 0 {
 		minrLog.Infof("DEV #%d (%s) %v Fan=%v%% T=%vC",
@@ -360,3 +355,17 @@ func (d *Device) UpdateFanTemp() {
 		}
 	}
 }
+
+func (d *Device) Status() (float64, uint32, uint32) {
+	secondsElapsed := uint32(time.Now().Unix()) - d.started
+	diffOneShareHashesAvg := uint64(0x00000000FFFFFFFF)
+
+	averageHashRate := (float64(diffOneShareHashesAvg) *
+		float64(d.allDiffOneShares)) /
+		float64(secondsElapsed)
+
+	fanPercent := atomic.LoadUint32(&d.fanPercent)
+	temperature := atomic.LoadUint32(&d.temperature)
+
+	return averageHashRate, fanPercent, temperature
+}
diff --git a/main.go b/main.go
index 572f987..931a24e 100644
--- a/main.go
+++ b/main.go
@@ -82,6 +82,10 @@ func gominerMain() error {
 		return err
 	}
 
+	if len(cfg.APIListeners) != 0 {
+		go RunMonitor(m)
+	}
+
 	c := make(chan os.Signal, 1)
 	signal.Notify(c, os.Interrupt)
 	go func() {
diff --git a/miner.go b/miner.go
index 2724753..b64a100 100644
--- a/miner.go
+++ b/miner.go
@@ -158,11 +158,9 @@ func (m *Miner) printStatsThread() {
 
 	for {
 		if !cfg.Benchmark {
+			valid, rejected, stale, total, utility := m.Status()
+
 			if cfg.Pool != "" {
-				valid := atomic.LoadUint64(&m.pool.ValidShares)
-				rejected := atomic.LoadUint64(&m.pool.InvalidShares)
-				stale := atomic.LoadUint64(&m.staleShares)
-				total := valid + rejected + stale
 				minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Stale: %v, Total: %v",
 					valid,
 					rejected,
@@ -171,13 +169,9 @@ func (m *Miner) printStatsThread() {
 				)
 				secondsElapsed := uint32(time.Now().Unix()) - m.started
 				if (secondsElapsed / 60) > 0 {
-					utility := float64(valid) / (float64(secondsElapsed) / float64(60))
 					minrLog.Infof("Global utility (accepted shares/min): %v", utility)
 				}
 			} else {
-				valid := atomic.LoadUint64(&m.validShares)
-				rejected := atomic.LoadUint64(&m.invalidShares)
-				total := valid + rejected
 				minrLog.Infof("Global stats: Accepted: %v, Rejected: %v, Total: %v",
 					valid,
 					rejected,
@@ -241,3 +235,23 @@ func (m *Miner) Stop() {
 		d.Stop()
 	}
 }
+
+func (m *Miner) Status() (uint64, uint64, uint64, uint64, float64) {
+	if cfg.Pool != "" {
+		valid := atomic.LoadUint64(&m.pool.ValidShares)
+		rejected := atomic.LoadUint64(&m.pool.InvalidShares)
+		stale := atomic.LoadUint64(&m.staleShares)
+		total := valid + rejected + stale
+
+		secondsElapsed := uint32(time.Now().Unix()) - m.started
+		utility := float64(valid) / (float64(secondsElapsed) / float64(60))
+
+		return valid, rejected, stale, total, utility
+	}
+
+	valid := atomic.LoadUint64(&m.validShares)
+	rejected := atomic.LoadUint64(&m.invalidShares)
+	total := valid + rejected
+
+	return valid, rejected, 0, total, 0
+}
diff --git a/monitor.go b/monitor.go
new file mode 100644
index 0000000..0700ac4
--- /dev/null
+++ b/monitor.go
@@ -0,0 +1,108 @@
+package main
+
+import (
+	"encoding/json"
+	"net/http"
+	"time"
+
+	"github.com/decred/gominer/util"
+)
+
+type MinerStatus struct {
+	ValidShares     uint64  `json:"validShares"`
+	StaleShares     uint64  `json:"staleShares"`
+	InvalidShares   uint64  `json:"invalidShares"`
+	TotalShares     uint64  `json:"totalShares"`
+	SharesPerMinute float64 `json:"sharesPerMinute"`
+	Started         uint32  `json:"started"`
+	Uptime          uint32  `json:"uptime"`
+
+	Devices []*DeviceStatus `json:"devices"`
+	Pool    *PoolStatus     `json:"pool,omitempty"`
+}
+
+type DeviceStatus struct {
+	Index      int    `json:"index"`
+	DeviceName string `json:"deviceName"`
+	DeviceType string `json:"deviceType"`
+
+	HashRate          float64 `json:"hashRate"`
+	HashRateFormatted string  `json:"hashRateFormatted"`
+
+	FanPercent  uint32 `json:"fanPercent"`
+	Temperature uint32 `json:"temperature"`
+
+	Started uint32 `json:"started"`
+}
+
+type PoolStatus struct {
+	Started uint32 `json:"started"`
+	Uptime  uint32 `json:"uptime"`
+}
+
+var (
+	m *Miner
+)
+
+func RunMonitor(tm *Miner) {
+	m = tm
+
+	if len(cfg.APIListeners) != 0 {
+		http.HandleFunc("/", getMinerStatus)
+
+		for _, addr := range cfg.APIListeners {
+			err := http.ListenAndServe(addr, nil)
+
+			if err != nil {
+				mainLog.Warnf("Unable to create monitor: %v", err)
+				return
+			}
+		}
+	}
+}
+
+func getMinerStatus(w http.ResponseWriter, req *http.Request) {
+	ms := &MinerStatus{
+		Started: m.started,
+		Uptime:  uint32(time.Now().Unix()) - m.started,
+	}
+
+	if !cfg.Benchmark {
+		valid, invalid, stale, total, sharesPerMinute := m.Status()
+
+		ms.ValidShares = valid
+		ms.InvalidShares = invalid
+		ms.StaleShares = stale
+		ms.TotalShares = total
+		ms.SharesPerMinute = sharesPerMinute
+
+		if cfg.Pool != "" {
+			ms.Pool = &PoolStatus{
+				Started: m.started,
+				Uptime:  uint32(time.Now().Unix()) - m.started,
+			}
+		}
+	}
+
+	for _, d := range m.devices {
+		d.UpdateFanTemp()
+
+		averageHashRate,
+			fanPercent,
+			temperature := d.Status()
+
+		ms.Devices = append(ms.Devices, &DeviceStatus{
+			Index:             d.index,
+			DeviceName:        d.deviceName,
+			DeviceType:        d.deviceType,
+			HashRate:          averageHashRate,
+			HashRateFormatted: util.FormatHashRate(averageHashRate),
+			FanPercent:        fanPercent,
+			Temperature:       temperature,
+			Started:           d.started,
+		})
+	}
+
+	w.Header().Add("Content-Type", "application/json")
+	json.NewEncoder(w).Encode(ms)
+}
diff --git a/sample-gominer.conf b/sample-gominer.conf
index c74e8e9..bf3f38c 100644
--- a/sample-gominer.conf
+++ b/sample-gominer.conf
@@ -45,6 +45,39 @@
 ; Write memory profile.
 ; memprofile=/some/path
 
+; ------------------------------------------------------------------------------
+; Status API
+; ------------------------------------------------------------------------------
+
+; Specify the interfaces for the API server to listen on.
+; One listen address per line.
+; If you don't specify a port, the default port 3333 is used.
+;
+;   All interfaces on default port:
+; apilisten=
+;   All ipv4 interfaces on default port:
+; apilisten=0.0.0.0
+;   All ipv6 interfaces on default port:
+; apilisten=::
+;   All interfaces on port 9109:
+; apilisten=:9109
+;   All ipv4 interfaces on port 9109:
+; apilisten=0.0.0.0:9109
+;   All ipv6 interfaces on port 9109:
+; apilisten=[::]:9109
+;   Only ipv4 localhost on port 9109:
+; apilisten=127.0.0.1:9109
+;   Only ipv6 localhost on port 9109:
+; apilisten=[::1]:9109
+;   Only ipv4 localhost on non-standard port 8337:
+; apilisten=127.0.0.1:8337
+;   All interfaces on non-standard port 8337:
+; apilisten=:8337
+;   All ipv4 interfaces on non-standard port 8337:
+; apilisten=0.0.0.0:8337
+;   All ipv6 interfaces on non-standard port 8337:
+; apilisten=[::]:8337
+
 ; ------------------------------------------------------------------------------
 ; RPC client settings
 ; ------------------------------------------------------------------------------
diff --git a/stratum/stratum.go b/stratum/stratum.go
index bff1fdf..3adbb34 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -57,6 +57,8 @@ type Stratum struct {
 	Diff      float64
 	Target    *big.Int
 	PoolWork  NotifyWork
+
+	Started uint32
 }
 
 // Config holdes the config options that may be used by a stratum pool.
@@ -234,6 +236,8 @@ func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string)
 		return nil, err
 	}
 
+	stratum.Started = uint32(time.Now().Unix())
+
 	return &stratum, nil
 }
 
@@ -267,6 +271,10 @@ func (s *Stratum) Reconnect() error {
 	if err != nil {
 		return nil
 	}
+
+	// If we were able to reconnect, restart counter
+	s.Started = uint32(time.Now().Unix())
+
 	return nil
 }
 

From ba9842f04305e22b22510edf84ad3e9f685c942f Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Thu, 5 Oct 2017 12:46:09 -0400
Subject: [PATCH 094/150] docs: update docs for dep (#168)

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index e9d484b..bb72afd 100644
--- a/README.md
+++ b/README.md
@@ -98,12 +98,12 @@ well.
 To download and build gominer, run:
 
 ```
-go get -u github.com/Masterminds/glide
+go get -u github.com/golang/dep/cmd/dep
 mkdir -p $GOPATH/src/github.com/decred
 cd $GOPATH/src/github.com/decred
 git clone  https://github.com/decred/gominer.git
 cd gominer
-glide install
+dep ensure
 ```
 
 For CUDA with NVIDIA Management Library (NVML) support:
@@ -135,8 +135,8 @@ go build -tags opencladl
   * This should have been done by the Go installer
 - Add the following locations to your PATH: `C:\Users\username\go\bin;C:\Go\bin`
 - Add `C:\Program Files\mingw-w64\x84_64-6.2.0-posix-seh-rt_v5-rev1\mingw64\bin` to your PATH (This is the latest release as of 2016-09-29)
-- `go get github.com/Masterminds/glide`
-  * You should be able to type ```glide``` and get glide's usage display.  If not, double check the steps above
+- `go get github.com/golang/dep/cmd/dep`
+  * You should be able to type ```dep``` and get dep's usage display.  If not, double check the steps above
 - `go get github.com/decred/gominer`
   * Compilation will most likely fail which can be safely ignored for now.
 - Change to the gominer directory
@@ -144,8 +144,8 @@ go build -tags opencladl
   ```cd %GOPATH%/src/github.com/decred/gominer```
   * If using git-bash
   ```cd $GOPATH/src/github.com/decred/gominer```
-- Install dependencies via glide
-  * ```glide install```
+- Install dependencies via dep
+  * ```dep ensure```
 
 #### Build Instructions
 

From 62d11fbb55c1683530f018c51cd63135298f6b88 Mon Sep 17 00:00:00 2001
From: Nicola Larosa <github@teknico.net>
Date: Fri, 20 Oct 2017 19:57:15 +0200
Subject: [PATCH 095/150] Update dependencies/imports for repo layout change

---
 Gopkg.lock | 20 +++++++++++---------
 Gopkg.toml |  4 ----
 config.go  |  2 +-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/Gopkg.lock b/Gopkg.lock
index 8e57778..1c924bc 100644
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -2,9 +2,10 @@
 
 
 [[projects]]
+  branch = "master"
   name = "github.com/agl/ed25519"
   packages = [".","edwards25519"]
-  revision = "278e1ec8e8a6e017cd07577924d6766039146ced"
+  revision = "5312a61534124124185d41f09206b9fef1d88403"
 
 [[projects]]
   branch = "master"
@@ -44,15 +45,15 @@
 
 [[projects]]
   branch = "master"
-  name = "github.com/decred/dcrd"
-  packages = ["blockchain","blockchain/internal/dbnamespace","blockchain/internal/progresslog","blockchain/stake","blockchain/stake/internal/dbnamespace","blockchain/stake/internal/ticketdb","blockchain/stake/internal/tickettreap","chaincfg","chaincfg/chainec","chaincfg/chainhash","database","dcrec/edwards","dcrec/secp256k1","dcrec/secp256k1/schnorr","txscript","wire"]
-  revision = "f11fac8134626768ef7b1f278fd4331f004bfb49"
+  name = "github.com/decred/base58"
+  packages = ["."]
+  revision = "b3520e187fa8ebe65eb74245408cf4b83e6a65d3"
 
 [[projects]]
   branch = "master"
-  name = "github.com/decred/dcrutil"
-  packages = [".","base58"]
-  revision = "ddbde93f65ab0692e54ed8a5ad325fa2e8af4daa"
+  name = "github.com/decred/dcrd"
+  packages = ["blockchain","blockchain/internal/dbnamespace","blockchain/internal/progresslog","blockchain/stake","blockchain/stake/internal/dbnamespace","blockchain/stake/internal/ticketdb","blockchain/stake/internal/tickettreap","chaincfg","chaincfg/chainec","chaincfg/chainhash","database","dcrec/edwards","dcrec/secp256k1","dcrec/secp256k1/schnorr","dcrutil","txscript","wire"]
+  revision = "f903c700a41a80f59daf21fa8489a437485dcfdb"
 
 [[projects]]
   name = "github.com/jrick/logrotate"
@@ -60,13 +61,14 @@
   revision = "4ed05ed86ef17d10ff99cce77481e0fcf6f2c7b0"
 
 [[projects]]
+  branch = "master"
   name = "golang.org/x/crypto"
   packages = ["ripemd160"]
-  revision = "adbae1b6b6fb4b02448a0fc0dbbc9ba2b95b294d"
+  revision = "9419663f5a44be8b34ca85f08abc5fe1be11f8a3"
 
 [solve-meta]
   analyzer-name = "dep"
   analyzer-version = 1
-  inputs-digest = "4919571b7c7b0f4116ae50ef832452943d99856b0f19a2577f89d4033e305825"
+  inputs-digest = "ca865d509ef76e69617ba986312229371b896b2822a4aee9c5010c0c19149b7a"
   solver-name = "gps-cdcl"
   solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
index 11d23a6..4045c9b 100644
--- a/Gopkg.toml
+++ b/Gopkg.toml
@@ -22,7 +22,3 @@
 [[constraint]]
   branch = "master"
   name = "github.com/decred/dcrd"
-
-[[constraint]]
-  branch = "master"
-  name = "github.com/decred/dcrutil"
diff --git a/config.go b/config.go
index de29d43..082f1e3 100644
--- a/config.go
+++ b/config.go
@@ -16,7 +16,7 @@ import (
 
 	"github.com/btcsuite/btclog"
 	"github.com/btcsuite/go-flags"
-	"github.com/decred/dcrutil"
+	"github.com/decred/dcrd/dcrutil"
 )
 
 const (

From 5225f93c5269686fa87a8847e8966a709e55c5ca Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 28 Aug 2018 21:39:54 -0400
Subject: [PATCH 096/150] Use slog (#179)

---
 Gopkg.lock     | 49 ++++++++++++++++++++++++++++++++++++-------------
 Gopkg.toml     |  4 ----
 config.go      |  4 ++--
 log.go         |  9 ++++-----
 stratum/log.go | 12 ++++++------
 5 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/Gopkg.lock b/Gopkg.lock
index 1c924bc..3f9c626 100644
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -4,8 +4,11 @@
 [[projects]]
   branch = "master"
   name = "github.com/agl/ed25519"
-  packages = [".","edwards25519"]
-  revision = "5312a61534124124185d41f09206b9fef1d88403"
+  packages = [
+    ".",
+    "edwards25519"
+  ]
+  revision = "278e1ec8e8a6e017cd07577924d6766039146ced"
 
 [[projects]]
   branch = "master"
@@ -13,12 +16,6 @@
   packages = ["cu"]
   revision = "57cec7ab46da74b8ca2aa1d41e898c0646081b56"
 
-[[projects]]
-  branch = "master"
-  name = "github.com/btcsuite/btclog"
-  packages = ["."]
-  revision = "84c8d2346e9fc8c7b947e243b9c24e6df9fd206a"
-
 [[projects]]
   branch = "master"
   name = "github.com/btcsuite/go-flags"
@@ -44,16 +41,42 @@
   revision = "dee3fe6eb0e98dc774a94fc231f85baf7c29d360"
 
 [[projects]]
-  branch = "master"
   name = "github.com/decred/base58"
   packages = ["."]
-  revision = "b3520e187fa8ebe65eb74245408cf4b83e6a65d3"
+  revision = "56c501706f00d9e1cfacee19a27117e12da24734"
+  version = "v1.0.0"
 
 [[projects]]
   branch = "master"
   name = "github.com/decred/dcrd"
-  packages = ["blockchain","blockchain/internal/dbnamespace","blockchain/internal/progresslog","blockchain/stake","blockchain/stake/internal/dbnamespace","blockchain/stake/internal/ticketdb","blockchain/stake/internal/tickettreap","chaincfg","chaincfg/chainec","chaincfg/chainhash","database","dcrec/edwards","dcrec/secp256k1","dcrec/secp256k1/schnorr","dcrutil","txscript","wire"]
-  revision = "f903c700a41a80f59daf21fa8489a437485dcfdb"
+  packages = [
+    "blockchain",
+    "blockchain/internal/dbnamespace",
+    "blockchain/internal/progresslog",
+    "blockchain/stake",
+    "blockchain/stake/internal/dbnamespace",
+    "blockchain/stake/internal/ticketdb",
+    "blockchain/stake/internal/tickettreap",
+    "chaincfg",
+    "chaincfg/chainec",
+    "chaincfg/chainhash",
+    "database",
+    "dcrec",
+    "dcrec/edwards",
+    "dcrec/secp256k1",
+    "dcrec/secp256k1/schnorr",
+    "dcrjson",
+    "dcrutil",
+    "txscript",
+    "wire"
+  ]
+  revision = "56f1981127f5d65c82b6ff6eb4d73b010994dd5b"
+
+[[projects]]
+  name = "github.com/decred/slog"
+  packages = ["."]
+  revision = "fbd821ef791ba2b8ae945f5d44f4e49396d230c5"
+  version = "v1.0.0"
 
 [[projects]]
   name = "github.com/jrick/logrotate"
@@ -69,6 +92,6 @@
 [solve-meta]
   analyzer-name = "dep"
   analyzer-version = 1
-  inputs-digest = "ca865d509ef76e69617ba986312229371b896b2822a4aee9c5010c0c19149b7a"
+  inputs-digest = "4bd77619a4360b3aebea16031b238f99412eb4783863b38ed10c41d940e0c424"
   solver-name = "gps-cdcl"
   solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
index 4045c9b..d1c7fad 100644
--- a/Gopkg.toml
+++ b/Gopkg.toml
@@ -3,10 +3,6 @@
   branch = "master"
   name = "github.com/barnex/cuda5"
 
-[[constraint]]
-  branch = "master"
-  name = "github.com/btcsuite/btclog"
-
 [[constraint]]
   branch = "master"
   name = "github.com/btcsuite/go-flags"
diff --git a/config.go b/config.go
index 082f1e3..942e218 100644
--- a/config.go
+++ b/config.go
@@ -14,9 +14,9 @@ import (
 	"strings"
 	"time"
 
-	"github.com/btcsuite/btclog"
 	"github.com/btcsuite/go-flags"
 	"github.com/decred/dcrd/dcrutil"
+	"github.com/decred/slog"
 )
 
 const (
@@ -146,7 +146,7 @@ func fileExists(name string) bool {
 
 // validLogLevel returns whether or not logLevel is a valid debug log level.
 func validLogLevel(logLevel string) bool {
-	_, ok := btclog.LevelFromString(logLevel)
+	_, ok := slog.LevelFromString(logLevel)
 	return ok
 }
 
diff --git a/log.go b/log.go
index c0d03bd..5744df0 100644
--- a/log.go
+++ b/log.go
@@ -7,8 +7,7 @@ import (
 	"path/filepath"
 
 	"github.com/decred/gominer/stratum"
-
-	"github.com/btcsuite/btclog"
+	"github.com/decred/slog"
 	"github.com/jrick/logrotate/rotator"
 )
 
@@ -34,7 +33,7 @@ var (
 	// backendLog is the logging backend used to create all subsystem loggers.
 	// The backend must not be used before the log rotator has been initialized,
 	// or data races and/or nil pointer dereferences will occur.
-	backendLog = btclog.NewBackend(logWriter{})
+	backendLog = slog.NewBackend(logWriter{})
 
 	// logRotator is one of the logging outputs.  It should be closed on
 	// application shutdown.
@@ -54,7 +53,7 @@ func init() {
 	stratum.UseLogger(poolLog)
 }
 
-var subsystemLoggers = map[string]btclog.Logger{
+var subsystemLoggers = map[string]slog.Logger{
 	"MAIN": mainLog,
 	"MINR": minrLog,
 	"POOL": poolLog,
@@ -94,7 +93,7 @@ func setLogLevel(subsystemID string, logLevel string) {
 	}
 
 	// Defaults to info if the log level is invalid.
-	level, _ := btclog.LevelFromString(logLevel)
+	level, _ := slog.LevelFromString(logLevel)
 	logger.SetLevel(level)
 }
 
diff --git a/stratum/log.go b/stratum/log.go
index a256fcb..5b7c016 100644
--- a/stratum/log.go
+++ b/stratum/log.go
@@ -1,26 +1,26 @@
 // Copyright (c) 2013-2015 The btcsuite developers
-// Copyright (c) 2016 The Decred developers
+// Copyright (c) 2016-2018 The Decred developers
 // Use of this source code is governed by an ISC
 // license that can be found in the LICENSE file.
 
 package stratum
 
-import "github.com/btcsuite/btclog"
+import "github.com/decred/slog"
 
 // log is a logger that is initialized with no output filters.  This
 // means the package will not perform any logging by default until the caller
 // requests it.
-var log = btclog.Disabled
+var log = slog.Disabled
 
 // DisableLog disables all library log output.  Logging output is disabled
 // by default until either UseLogger or SetLogWriter are called.
 func DisableLog() {
-	log = btclog.Disabled
+	log = slog.Disabled
 }
 
 // UseLogger uses a specified Logger to output package logging info.
 // This should be used in preference to SetLogWriter if the caller is also
-// using btclog.
-func UseLogger(logger btclog.Logger) {
+// using slog.
+func UseLogger(logger slog.Logger) {
 	log = logger
 }

From 773947940882b4578c5cf4ce5ba7f64b6e96e26c Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 28 Aug 2018 21:56:47 -0400
Subject: [PATCH 097/150] Use testnet3 (#180)

While here, update deps and fix PowLimit on !mainnet
---
 Gopkg.lock | 15 ++++++++-------
 config.go  |  3 +++
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/Gopkg.lock b/Gopkg.lock
index 3f9c626..a23de5a 100644
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -8,13 +8,13 @@
     ".",
     "edwards25519"
   ]
-  revision = "278e1ec8e8a6e017cd07577924d6766039146ced"
+  revision = "5312a61534124124185d41f09206b9fef1d88403"
 
 [[projects]]
   branch = "master"
   name = "github.com/barnex/cuda5"
   packages = ["cu"]
-  revision = "57cec7ab46da74b8ca2aa1d41e898c0646081b56"
+  revision = "da30a9b287d8f7ad210d42d911e33ef5c511544b"
 
 [[projects]]
   branch = "master"
@@ -29,16 +29,16 @@
   revision = "4720035b7bfd2a9bb130b1c184f8bbe41b6f0d0f"
 
 [[projects]]
-  branch = "master"
   name = "github.com/davecgh/go-spew"
   packages = ["spew"]
-  revision = "a476722483882dd40b8111f0eb64e1d7f43f56e4"
+  revision = "8991bc29aa16c548c550c7ff78260e27b9ab7c73"
+  version = "v1.1.1"
 
 [[projects]]
-  branch = "master"
   name = "github.com/dchest/blake256"
   packages = ["."]
   revision = "dee3fe6eb0e98dc774a94fc231f85baf7c29d360"
+  version = "v1.0.0"
 
 [[projects]]
   name = "github.com/decred/base58"
@@ -81,13 +81,14 @@
 [[projects]]
   name = "github.com/jrick/logrotate"
   packages = ["rotator"]
-  revision = "4ed05ed86ef17d10ff99cce77481e0fcf6f2c7b0"
+  revision = "a93b200c26cbae3bb09dd0dc2c7c7fe1468a034a"
+  version = "v1.0.0"
 
 [[projects]]
   branch = "master"
   name = "golang.org/x/crypto"
   packages = ["ripemd160"]
-  revision = "9419663f5a44be8b34ca85f08abc5fe1be11f8a3"
+  revision = "614d502a4dac94afa3a6ce146bd1736da82514c6"
 
 [solve-meta]
   analyzer-name = "dep"
diff --git a/config.go b/config.go
index 942e218..372b77d 100644
--- a/config.go
+++ b/config.go
@@ -15,6 +15,7 @@ import (
 	"time"
 
 	"github.com/btcsuite/go-flags"
+	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/dcrutil"
 	"github.com/decred/slog"
 )
@@ -582,8 +583,10 @@ func loadConfig() (*config, []string, error) {
 	switch {
 	case cfg.TestNet:
 		defaultRPCPort = defaultRPCPortTestNet
+		chainParams = &chaincfg.TestNet3Params
 	case cfg.SimNet:
 		defaultRPCPort = defaultRPCPortSimNet
+		chainParams = &chaincfg.SimNetParams
 	default:
 		defaultRPCPort = defaultRPCPortMainNet
 	}

From 91b01dbdb3446f52852f62df8c067ad94909ec0e Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Wed, 29 Aug 2018 11:02:01 -0400
Subject: [PATCH 098/150] Use new logrotator API (#182)

---
 .travis.yml |  2 +-
 log.go      | 15 ++++-----------
 2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9d5d192..7b05094 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,5 @@
 language: go
 go:
-  - 1.8.x
   - 1.9.x
 sudo: required
 dist: trusty
@@ -10,6 +9,7 @@ before_install:
 install:
   - go get -v github.com/golang/dep/cmd/dep
   - dep ensure
+  - go build -tags opencl
   - go get -v github.com/alecthomas/gometalinter
   - gometalinter --install
 script:
diff --git a/log.go b/log.go
index 5744df0..2559d61 100644
--- a/log.go
+++ b/log.go
@@ -2,7 +2,6 @@ package main
 
 import (
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 
@@ -17,7 +16,9 @@ type logWriter struct{}
 
 func (logWriter) Write(p []byte) (n int, err error) {
 	os.Stdout.Write(p)
-	logRotatorPipe.Write(p)
+	if logRotator != nil {
+		logRotator.Write(p)
+	}
 	return len(p), nil
 }
 
@@ -39,10 +40,6 @@ var (
 	// application shutdown.
 	logRotator *rotator.Rotator
 
-	// logRotatorPipe is the write-end pipe for writing to the log rotator.  It
-	// is written to by the Write method of the logWriter type.
-	logRotatorPipe *io.PipeWriter
-
 	mainLog = backendLog.Logger("MAIN")
 	minrLog = backendLog.Logger("MINR")
 	poolLog = backendLog.Logger("POOL")
@@ -69,17 +66,13 @@ func initLogRotator(logFile string) {
 		fmt.Fprintf(os.Stderr, "failed to create log directory: %v\n", err)
 		os.Exit(1)
 	}
-	pr, pw := io.Pipe()
-	r, err := rotator.New(pr, logFile, 10*1024, false, 3)
+	r, err := rotator.New(logFile, 10*1024, false, 3)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "failed to create file rotator: %v\n", err)
 		os.Exit(1)
 	}
 
-	go r.Run()
-
 	logRotator = r
-	logRotatorPipe = pw
 }
 
 // setLogLevel sets the logging level for provided subsystem.  Invalid

From f703453fc2c46947720579996ddc228dcff6c827 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 12 Feb 2019 14:16:47 +0000
Subject: [PATCH 099/150] Require go 1.11+ (#184)

---
 .travis.yml |  11 ++----
 Gopkg.lock  |  98 ----------------------------------------------
 Gopkg.toml  |  20 ----------
 README.md   |  15 +++----
 go.mod      |  15 +++++++
 go.sum      | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 goclean.sh  |  19 ++++-----
 7 files changed, 143 insertions(+), 146 deletions(-)
 delete mode 100644 Gopkg.lock
 delete mode 100644 Gopkg.toml
 create mode 100644 go.mod
 create mode 100644 go.sum

diff --git a/.travis.yml b/.travis.yml
index 7b05094..a37fa98 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,17 +1,14 @@
 language: go
 go:
-  - 1.9.x
+  - 1.11.x
 sudo: required
 dist: trusty
 before_install:
   - sudo apt-get update
   - sudo apt-get install opencl-headers nvidia-opencl-dev
 install:
-  - go get -v github.com/golang/dep/cmd/dep
-  - dep ensure
-  - go build -tags opencl
-  - go get -v github.com/alecthomas/gometalinter
-  - gometalinter --install
+  - go get -v github.com/golangci/golangci-lint/cmd/golangci-lint
 script:
-  - export PATH=$PATH:$HOME/gopath/bin
+  - export GO111MODULE=on
+  - go build -tags opencl
   - ./goclean.sh
diff --git a/Gopkg.lock b/Gopkg.lock
deleted file mode 100644
index a23de5a..0000000
--- a/Gopkg.lock
+++ /dev/null
@@ -1,98 +0,0 @@
-# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
-
-
-[[projects]]
-  branch = "master"
-  name = "github.com/agl/ed25519"
-  packages = [
-    ".",
-    "edwards25519"
-  ]
-  revision = "5312a61534124124185d41f09206b9fef1d88403"
-
-[[projects]]
-  branch = "master"
-  name = "github.com/barnex/cuda5"
-  packages = ["cu"]
-  revision = "da30a9b287d8f7ad210d42d911e33ef5c511544b"
-
-[[projects]]
-  branch = "master"
-  name = "github.com/btcsuite/go-flags"
-  packages = ["."]
-  revision = "6c288d648c1cc1befcb90cb5511dcacf64ae8e61"
-
-[[projects]]
-  branch = "master"
-  name = "github.com/btcsuite/go-socks"
-  packages = ["socks"]
-  revision = "4720035b7bfd2a9bb130b1c184f8bbe41b6f0d0f"
-
-[[projects]]
-  name = "github.com/davecgh/go-spew"
-  packages = ["spew"]
-  revision = "8991bc29aa16c548c550c7ff78260e27b9ab7c73"
-  version = "v1.1.1"
-
-[[projects]]
-  name = "github.com/dchest/blake256"
-  packages = ["."]
-  revision = "dee3fe6eb0e98dc774a94fc231f85baf7c29d360"
-  version = "v1.0.0"
-
-[[projects]]
-  name = "github.com/decred/base58"
-  packages = ["."]
-  revision = "56c501706f00d9e1cfacee19a27117e12da24734"
-  version = "v1.0.0"
-
-[[projects]]
-  branch = "master"
-  name = "github.com/decred/dcrd"
-  packages = [
-    "blockchain",
-    "blockchain/internal/dbnamespace",
-    "blockchain/internal/progresslog",
-    "blockchain/stake",
-    "blockchain/stake/internal/dbnamespace",
-    "blockchain/stake/internal/ticketdb",
-    "blockchain/stake/internal/tickettreap",
-    "chaincfg",
-    "chaincfg/chainec",
-    "chaincfg/chainhash",
-    "database",
-    "dcrec",
-    "dcrec/edwards",
-    "dcrec/secp256k1",
-    "dcrec/secp256k1/schnorr",
-    "dcrjson",
-    "dcrutil",
-    "txscript",
-    "wire"
-  ]
-  revision = "56f1981127f5d65c82b6ff6eb4d73b010994dd5b"
-
-[[projects]]
-  name = "github.com/decred/slog"
-  packages = ["."]
-  revision = "fbd821ef791ba2b8ae945f5d44f4e49396d230c5"
-  version = "v1.0.0"
-
-[[projects]]
-  name = "github.com/jrick/logrotate"
-  packages = ["rotator"]
-  revision = "a93b200c26cbae3bb09dd0dc2c7c7fe1468a034a"
-  version = "v1.0.0"
-
-[[projects]]
-  branch = "master"
-  name = "golang.org/x/crypto"
-  packages = ["ripemd160"]
-  revision = "614d502a4dac94afa3a6ce146bd1736da82514c6"
-
-[solve-meta]
-  analyzer-name = "dep"
-  analyzer-version = 1
-  inputs-digest = "4bd77619a4360b3aebea16031b238f99412eb4783863b38ed10c41d940e0c424"
-  solver-name = "gps-cdcl"
-  solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
deleted file mode 100644
index d1c7fad..0000000
--- a/Gopkg.toml
+++ /dev/null
@@ -1,20 +0,0 @@
-
-[[constraint]]
-  branch = "master"
-  name = "github.com/barnex/cuda5"
-
-[[constraint]]
-  branch = "master"
-  name = "github.com/btcsuite/go-flags"
-
-[[constraint]]
-  branch = "master"
-  name = "github.com/btcsuite/go-socks"
-
-[[constraint]]
-  branch = "master"
-  name = "github.com/davecgh/go-spew"
-
-[[constraint]]
-  branch = "master"
-  name = "github.com/decred/dcrd"
diff --git a/README.md b/README.md
index bb72afd..c472fd6 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ Intel Graphics) and CUDA libraries with:
 sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
-gominer has been built successfully on Ubuntu 16.04 with go1.6.2, go1.7.1,
+gominer has been built successfully on Ubuntu 16.04 with go1.11,
 g++ 5.4.0, and beignet-dev 1.1.1-2 although other combinations should work as
 well.
 
@@ -98,12 +98,11 @@ well.
 To download and build gominer, run:
 
 ```
-go get -u github.com/golang/dep/cmd/dep
-mkdir -p $GOPATH/src/github.com/decred
-cd $GOPATH/src/github.com/decred
-git clone  https://github.com/decred/gominer.git
+go get github.com/decred/gominer
+cd $GOPATH/src/github.com/decred/gominer
 cd gominer
-dep ensure
+
+env GO111MODULE=on go build
 ```
 
 For CUDA with NVIDIA Management Library (NVML) support:
@@ -135,8 +134,6 @@ go build -tags opencladl
   * This should have been done by the Go installer
 - Add the following locations to your PATH: `C:\Users\username\go\bin;C:\Go\bin`
 - Add `C:\Program Files\mingw-w64\x84_64-6.2.0-posix-seh-rt_v5-rev1\mingw64\bin` to your PATH (This is the latest release as of 2016-09-29)
-- `go get github.com/golang/dep/cmd/dep`
-  * You should be able to type ```dep``` and get dep's usage display.  If not, double check the steps above
 - `go get github.com/decred/gominer`
   * Compilation will most likely fail which can be safely ignored for now.
 - Change to the gominer directory
@@ -144,8 +141,6 @@ go build -tags opencladl
   ```cd %GOPATH%/src/github.com/decred/gominer```
   * If using git-bash
   ```cd $GOPATH/src/github.com/decred/gominer```
-- Install dependencies via dep
-  * ```dep ensure```
 
 #### Build Instructions
 
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..d57fc13
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,15 @@
+module github.com/decred/gominer
+
+require (
+	github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8
+	github.com/btcsuite/go-flags v0.0.0-20150116065318-6c288d648c1c
+	github.com/btcsuite/go-socks v0.0.0-20170105172521-4720035b7bfd
+	github.com/davecgh/go-spew v1.1.1
+	github.com/decred/dcrd/blockchain v1.1.1
+	github.com/decred/dcrd/chaincfg v1.3.0
+	github.com/decred/dcrd/chaincfg/chainhash v1.0.1
+	github.com/decred/dcrd/dcrutil v1.2.0
+	github.com/decred/dcrd/wire v1.2.0
+	github.com/decred/slog v1.0.0
+	github.com/jrick/logrotate v1.0.0
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..e70b4ef
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,111 @@
+github.com/aead/siphash v0.0.0-20170329201724-e404fcfc8885/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII=
+github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 h1:w1UutsfOrms1J05zt7ISrnJIXKzwaspym5BTKGx93EI=
+github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412/go.mod h1:WPjqKcmVOxf0XSf3YxCJs6N6AOSrOx3obionmG7T0y0=
+github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8 h1:lnbKU7kkMoF75PDPYaj0DLoD0p6lWtzeyXSR94PrQto=
+github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8/go.mod h1:GnBnFz4V/+kxwKFnquvOOi+IjZoVJsIUbcAVOXLCxCo=
+github.com/btcsuite/go-flags v0.0.0-20150116065318-6c288d648c1c h1:jYE+0osxEwZMwnJkGJCiQZKLhMFMR6+G8QxtFtL9/Zw=
+github.com/btcsuite/go-flags v0.0.0-20150116065318-6c288d648c1c/go.mod h1:FAMFTQ1iW6ewsFxRHGOzmtOgCNMIw1ks4L86fZ3nNaw=
+github.com/btcsuite/go-socks v0.0.0-20170105172521-4720035b7bfd h1:R/opQEbFEy9JGkIguV40SvRY1uliPX8ifOvi6ICsFCw=
+github.com/btcsuite/go-socks v0.0.0-20170105172521-4720035b7bfd/go.mod h1:HHNXQzUsZCxOoE+CPiyCTO6x34Zs86zZUiwtpXoGdtg=
+github.com/btcsuite/goleveldb v1.0.0 h1:Tvd0BfvqX9o823q1j2UZ/epQo09eJh6dTcRp79ilIN4=
+github.com/btcsuite/goleveldb v1.0.0/go.mod h1:QiK9vBlgftBg6rWQIj6wFzbPfRjiykIEhBH4obrXJ/I=
+github.com/btcsuite/snappy-go v1.0.0 h1:ZxaA6lo2EpxGddsA8JwWOcxlzRybb444sgmeJQMJGQE=
+github.com/btcsuite/snappy-go v1.0.0/go.mod h1:8woku9dyThutzjeg+3xrA5iCpBRH8XEEg3lh6TiUghc=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dchest/blake256 v1.0.0 h1:6gUgI5MHdz9g0TdrgKqXsoDX+Zjxmm1Sc6OsoGru50I=
+github.com/dchest/blake256 v1.0.0/go.mod h1:xXNWCE1jsAP8DAjP+rKw2MbeqLczjI3TRx2VK+9OEYY=
+github.com/decred/base58 v1.0.0 h1:BVi1FQCThIjZ0ehG+I99NJ51o0xcc9A/fDKhmJxY6+w=
+github.com/decred/base58 v1.0.0/go.mod h1:LLY1p5e3g91byL/UO1eiZaYd+uRoVRarybgcoymu9Ks=
+github.com/decred/dcrd/blockchain v1.0.2 h1:+gJFfgv5LK+LcadyoiMln838/aU3rxDd0Smqogd6fkA=
+github.com/decred/dcrd/blockchain v1.0.2/go.mod h1:R/4XnwNOTj5IP8jQIUzrJ8zhr/7EOk09IMODwBamZoI=
+github.com/decred/dcrd/blockchain v1.1.1 h1:CWr90sZ2YLQz84EGT+X/pzU+9AZB1eXQUy+4fsJSt5w=
+github.com/decred/dcrd/blockchain v1.1.1/go.mod h1:zxi/41LgzHitpz/CZu0gxHyFHz8+ysd3lH8E3P5Uifg=
+github.com/decred/dcrd/blockchain/stake v1.0.1 h1:IYGsNZRyMUsoFtVAUjd7XIccrIQ4YIqDeNzQJCjyS8A=
+github.com/decred/dcrd/blockchain/stake v1.0.1/go.mod h1:hgoGmWMIu2LLApBbcguVpzCEEfX7M2YhuMrQdpohJzc=
+github.com/decred/dcrd/blockchain/stake v1.1.0 h1:kCxZdQ2/UfcD+XjE3wlCv0vLKWR9ZFtjbbTTpudb74o=
+github.com/decred/dcrd/blockchain/stake v1.1.0/go.mod h1:WRuaml4bcyZYza1NT3qizlLcQwMIcAQRENvZVb2t884=
+github.com/decred/dcrd/chaincfg v1.1.1 h1:qRZkiA7ucsfsQPE/G/U1OnEUFozDl1MvM4ysJCUndLU=
+github.com/decred/dcrd/chaincfg v1.1.1/go.mod h1:UlGtnp8Xx9YK+etBTybGjoFGoGXSw2bxZQuAnwfKv6I=
+github.com/decred/dcrd/chaincfg v1.2.0/go.mod h1:kpoGTMIriKn5hHRSu5b65+Q9LlGUdbQcMzGujac1BVs=
+github.com/decred/dcrd/chaincfg v1.3.0 h1:DEysyX1/kxlWbY97PTIPpGbMOp3+n2iixi3m9d27A6c=
+github.com/decred/dcrd/chaincfg v1.3.0/go.mod h1:kpoGTMIriKn5hHRSu5b65+Q9LlGUdbQcMzGujac1BVs=
+github.com/decred/dcrd/chaincfg/chainhash v1.0.1 h1:0vG7U9+dSjSCaHQKdoSKURK2pOb47+b+8FK5q4+Je7M=
+github.com/decred/dcrd/chaincfg/chainhash v1.0.1/go.mod h1:OVfvaOsNLS/A1y4Eod0Ip/Lf8qga7VXCQjUQLbkY0Go=
+github.com/decred/dcrd/database v1.0.1 h1:BSIerNf4RhSA0iDhiE/320RYqD2y9T+SCj99Pv7svgo=
+github.com/decred/dcrd/database v1.0.1/go.mod h1:ILCeyOHFew3fZ7K2B9jl+tp5qFOap/pEGoo6Yy6Wk0g=
+github.com/decred/dcrd/database v1.0.3 h1:e5Q3gDt9LwfvpZxYqFF3OVzgr8bGeC1cen+V3mv/CCw=
+github.com/decred/dcrd/database v1.0.3/go.mod h1:TLxRwIV8x85+dxPTLAWu4mHg45TkKrrza5xzwOS1QtA=
+github.com/decred/dcrd/dcrec v0.0.0-20180721005212-59fe2b293f69/go.mod h1:cRAH1SNk8Mi9hKBc/DHbeiWz/fyO8KWZR3H7okrIuOA=
+github.com/decred/dcrd/dcrec v0.0.0-20180721031028-5369a485acf6/go.mod h1:cRAH1SNk8Mi9hKBc/DHbeiWz/fyO8KWZR3H7okrIuOA=
+github.com/decred/dcrd/dcrec v0.0.0-20180801202239-0761de129164 h1:N5s3yVfjBNW6XNG3gLxYpvt0IUjUsp/FRfC75QpSI+E=
+github.com/decred/dcrd/dcrec v0.0.0-20180801202239-0761de129164/go.mod h1:cRAH1SNk8Mi9hKBc/DHbeiWz/fyO8KWZR3H7okrIuOA=
+github.com/decred/dcrd/dcrec/edwards v0.0.0-20180721005212-59fe2b293f69/go.mod h1:+ehP0Hk/mesyZXttxCtBbhPX23BMpZJ1pcVBqUfbmvU=
+github.com/decred/dcrd/dcrec/edwards v0.0.0-20180721031028-5369a485acf6 h1:1T33paUnhZTyLN60k5DSy4CH9uTN4vQ9TdSyu4O1ox8=
+github.com/decred/dcrd/dcrec/edwards v0.0.0-20180721031028-5369a485acf6/go.mod h1:+ehP0Hk/mesyZXttxCtBbhPX23BMpZJ1pcVBqUfbmvU=
+github.com/decred/dcrd/dcrec/edwards v0.0.0-20181208004914-a0816cf4301f h1:NF7vp3nZ4MsAiXswGmE//m83jCN0lDsQrLI7IwLCTlo=
+github.com/decred/dcrd/dcrec/edwards v0.0.0-20181208004914-a0816cf4301f/go.mod h1:+ehP0Hk/mesyZXttxCtBbhPX23BMpZJ1pcVBqUfbmvU=
+github.com/decred/dcrd/dcrec/secp256k1 v1.0.0 h1:Le54WTGdTQv7XYXpS31uhFE8LZE7ypwsIL+FgDP2x5Q=
+github.com/decred/dcrd/dcrec/secp256k1 v1.0.0/go.mod h1:JPMFscGlgXTV684jxQNDijae2qrh0fLG7pJBimaYotE=
+github.com/decred/dcrd/dcrec/secp256k1 v1.0.1 h1:EFWVd1p0t0Y5tnsm/dJujgV0ORogRJ6vo7CMAjLseAc=
+github.com/decred/dcrd/dcrec/secp256k1 v1.0.1/go.mod h1:lhu4eZFSfTJWUnR3CFRcpD+Vta0KUAqnhTsTksHXgy0=
+github.com/decred/dcrd/dcrjson v1.0.0 h1:50DnA0XeV2JrQXoHh43TCKmH+kz2gHjZ1Mj/Pdk7Oz0=
+github.com/decred/dcrd/dcrjson v1.0.0/go.mod h1:ozddIaeF+EAvZZvFuB3zpfxhyxBGfvbt22crQh+PYuI=
+github.com/decred/dcrd/dcrutil v1.1.1 h1:zOkGiumN/JkobhAgpG/zfFgUoolGKVGYT5na1hbYUoE=
+github.com/decred/dcrd/dcrutil v1.1.1/go.mod h1:Jsttr0pEvzPAw+qay1kS1/PsbZYPyhluiNwwY6yBJS4=
+github.com/decred/dcrd/dcrutil v1.2.0 h1:Pd5Wf650g6Xu6luYDfGkh1yiUoPUAgqzRu6K+BGyJGg=
+github.com/decred/dcrd/dcrutil v1.2.0/go.mod h1:tUNHS2gj7ApeEVS8gb6O+4wJW7w3O2MSRyRdcjW1JxU=
+github.com/decred/dcrd/gcs v1.0.1/go.mod h1:YwutGzusSdJM79CJtxCo9t7WRCvnkLtWSD19TPo1i9g=
+github.com/decred/dcrd/txscript v1.0.1 h1:IMgxZFCw3AyG4EbKwywE3SDNshOSHsoUK1Wk/5GqWJ0=
+github.com/decred/dcrd/txscript v1.0.1/go.mod h1:FqUX07Y+u3cJ1eIGPoyWbJg+Wk1NTllln/TyDpx9KnY=
+github.com/decred/dcrd/txscript v1.0.2 h1:kzJZDuteyzvI15VNhtgFHxeeq210RTkFyfzN7d+1iPo=
+github.com/decred/dcrd/txscript v1.0.2/go.mod h1:hmUOHFlOjU7H6T/czt6kurWwXJvGPGKKGtXoft6w/qY=
+github.com/decred/dcrd/wire v1.1.0 h1:G+3CugtxNbToUN8RKWqm74yLfzJJ2BKMOr2RgWc4TyY=
+github.com/decred/dcrd/wire v1.1.0/go.mod h1:/JKOsLInOJu6InN+/zH5AyCq3YDIOW/EqcffvU8fJHM=
+github.com/decred/dcrd/wire v1.2.0 h1:HqJVB7vcklIguzFWgRXw/WYCQ9cD3bUC5TKj53i1Hng=
+github.com/decred/dcrd/wire v1.2.0/go.mod h1:/JKOsLInOJu6InN+/zH5AyCq3YDIOW/EqcffvU8fJHM=
+github.com/decred/slog v1.0.0 h1:Dl+W8O6/JH6n2xIFN2p3DNjCmjYwvrXsjlSJTQQ4MhE=
+github.com/decred/slog v1.0.0/go.mod h1:zR98rEZHSnbZ4WHZtO0iqmSZjDLKhkXfrPTZQKtAonQ=
+github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
+github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
+github.com/golang/protobuf v1.1.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
+github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
+github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
+github.com/jrick/logrotate v1.0.0 h1:lQ1bL/n9mBNeIXoTUoYRlK4dHuNJVofX9oWqBtPnSzI=
+github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
+github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/onsi/ginkgo v1.6.0 h1:Ix8l273rp3QzYgXSR+c8d1fTG7UPgYkOSELPhiY/YGw=
+github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/gomega v1.4.1 h1:PZSj/UFNaVp3KxrzHOcS7oyuWA7LoOY/77yCTEFu21U=
+github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
+golang.org/x/crypto v0.0.0-20180718160520-a2144134853f h1:lRy+hhwk7YT7MsKejxuz0C5Q1gk6p/QoPQYEmKmGFb8=
+golang.org/x/crypto v0.0.0-20180718160520-a2144134853f/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
+golang.org/x/net v0.0.0-20180719180050-a680a1efc54d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24 h1:mEsFm194MmS9vCwxFy+zwu0EU7ZkxxMD1iH++vmGdUY=
+golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f h1:wMNYb4v58l5UBM7MYRLPG6ZhfOqbKu7X5eyFl8ZhKvA=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20180816055513-1c9583448a9c/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20181004145325-8469e314837c h1:SJ7JoQNVl3mC7EWkkONgBWgCno8LcABIJwFMkWBC+EY=
+golang.org/x/sys v0.0.0-20181004145325-8469e314837c/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20181206074257-70b957f3b65e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
+gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
+gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE=
+gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/goclean.sh b/goclean.sh
index 039e004..6a5ac51 100755
--- a/goclean.sh
+++ b/goclean.sh
@@ -2,18 +2,15 @@
 # The script does automatic checking on a Go package and its sub-packages, including:
 # 1. gofmt         (http://golang.org/cmd/gofmt/)
 # 2. go vet        (http://golang.org/cmd/vet)
-# 3. goimports     (https://github.com/bradfitz/goimports)
 # 4. ineffassign   (https://github.com/gordonklaus/ineffassign)
 
-# gometalinter (github.com/alecthomas/gometalinter) is used to run each each
-# static checker.
-
 set -ex
 
-# Automatic checks
-test -z "$(gometalinter --vendor --disable-all \
---enable=gofmt \
---enable=vet \
---enable=goimports \
---enable=ineffassign \
---deadline=10m ./... | tee /dev/stderr)"
+# golangci-lint (github.com/golangci/golangci-lint) is used to run each each
+# static checker.
+
+# check linters
+golangci-lint run --build-tags opencl --disable-all --deadline=10m \
+  --enable=gofmt \
+  --enable=vet \
+  --enable=ineffassign

From a773d6983a46911633ec7903d3d63fca300ce023 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Sun, 9 Feb 2020 13:39:17 -0500
Subject: [PATCH 100/150] build: replace travis ci with ci via github actions
 (#189)

---
 .github/workflows/go.yml |  28 +++++++++
 .travis.yml              |  14 -----
 config.go                |   2 +-
 getwork.go               |   3 +-
 go.mod                   |  16 +++---
 go.sum                   | 121 +++++++++++++++++++--------------------
 goclean.sh               |   1 +
 stratum/stratum.go       |   4 +-
 8 files changed, 99 insertions(+), 90 deletions(-)
 create mode 100644 .github/workflows/go.yml
 delete mode 100644 .travis.yml

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
new file mode 100644
index 0000000..fff56f6
--- /dev/null
+++ b/.github/workflows/go.yml
@@ -0,0 +1,28 @@
+name: Build and Test
+on: [push, pull_request]
+jobs:
+  build:
+    name: Go CI
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go: [1.12, 1.13]
+    steps:
+      - name: Set up Go
+        uses: actions/setup-go@v1
+        with:
+          go-version: ${{ matrix.go }}
+      - name: Check out source
+        uses: actions/checkout@v1
+      - name: Install Linters
+        run: "curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(go env GOPATH)/bin v1.23.3"
+      - name: Build
+        env:
+          GO111MODULE: "on"
+        run: go build -tags opencl ./...
+      - name: Test
+        env:
+          GO111MODULE: "on"
+        run: |
+          export PATH=${PATH}:$(go env GOPATH)/bin
+          sh ./goclean.sh
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index a37fa98..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-language: go
-go:
-  - 1.11.x
-sudo: required
-dist: trusty
-before_install:
-  - sudo apt-get update
-  - sudo apt-get install opencl-headers nvidia-opencl-dev
-install:
-  - go get -v github.com/golangci/golangci-lint/cmd/golangci-lint
-script:
-  - export GO111MODULE=on
-  - go build -tags opencl
-  - ./goclean.sh
diff --git a/config.go b/config.go
index 372b77d..fe902ed 100644
--- a/config.go
+++ b/config.go
@@ -14,7 +14,7 @@ import (
 	"strings"
 	"time"
 
-	"github.com/btcsuite/go-flags"
+	"github.com/jessevdk/go-flags"
 	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/dcrutil"
 	"github.com/decred/slog"
diff --git a/getwork.go b/getwork.go
index 438437c..caa8679 100644
--- a/getwork.go
+++ b/getwork.go
@@ -17,8 +17,7 @@ import (
 	"strconv"
 	"time"
 
-	"github.com/btcsuite/go-socks/socks"
-
+	"github.com/decred/go-socks/socks"
 	"github.com/decred/gominer/stratum"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
diff --git a/go.mod b/go.mod
index d57fc13..632fddd 100644
--- a/go.mod
+++ b/go.mod
@@ -2,14 +2,16 @@ module github.com/decred/gominer
 
 require (
 	github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8
-	github.com/btcsuite/go-flags v0.0.0-20150116065318-6c288d648c1c
-	github.com/btcsuite/go-socks v0.0.0-20170105172521-4720035b7bfd
 	github.com/davecgh/go-spew v1.1.1
-	github.com/decred/dcrd/blockchain v1.1.1
-	github.com/decred/dcrd/chaincfg v1.3.0
-	github.com/decred/dcrd/chaincfg/chainhash v1.0.1
-	github.com/decred/dcrd/dcrutil v1.2.0
-	github.com/decred/dcrd/wire v1.2.0
+	github.com/decred/dcrd/blockchain v1.2.0
+	github.com/decred/dcrd/chaincfg v1.5.2
+	github.com/decred/dcrd/chaincfg/chainhash v1.0.2
+	github.com/decred/dcrd/dcrutil v1.4.0
+	github.com/decred/dcrd/wire v1.3.0
+	github.com/decred/go-socks v1.1.0
 	github.com/decred/slog v1.0.0
+	github.com/jessevdk/go-flags v1.4.0
 	github.com/jrick/logrotate v1.0.0
 )
+
+go 1.12
diff --git a/go.sum b/go.sum
index e70b4ef..0b5c6df 100644
--- a/go.sum
+++ b/go.sum
@@ -1,12 +1,7 @@
-github.com/aead/siphash v0.0.0-20170329201724-e404fcfc8885/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII=
 github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 h1:w1UutsfOrms1J05zt7ISrnJIXKzwaspym5BTKGx93EI=
 github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412/go.mod h1:WPjqKcmVOxf0XSf3YxCJs6N6AOSrOx3obionmG7T0y0=
 github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8 h1:lnbKU7kkMoF75PDPYaj0DLoD0p6lWtzeyXSR94PrQto=
 github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8/go.mod h1:GnBnFz4V/+kxwKFnquvOOi+IjZoVJsIUbcAVOXLCxCo=
-github.com/btcsuite/go-flags v0.0.0-20150116065318-6c288d648c1c h1:jYE+0osxEwZMwnJkGJCiQZKLhMFMR6+G8QxtFtL9/Zw=
-github.com/btcsuite/go-flags v0.0.0-20150116065318-6c288d648c1c/go.mod h1:FAMFTQ1iW6ewsFxRHGOzmtOgCNMIw1ks4L86fZ3nNaw=
-github.com/btcsuite/go-socks v0.0.0-20170105172521-4720035b7bfd h1:R/opQEbFEy9JGkIguV40SvRY1uliPX8ifOvi6ICsFCw=
-github.com/btcsuite/go-socks v0.0.0-20170105172521-4720035b7bfd/go.mod h1:HHNXQzUsZCxOoE+CPiyCTO6x34Zs86zZUiwtpXoGdtg=
 github.com/btcsuite/goleveldb v1.0.0 h1:Tvd0BfvqX9o823q1j2UZ/epQo09eJh6dTcRp79ilIN4=
 github.com/btcsuite/goleveldb v1.0.0/go.mod h1:QiK9vBlgftBg6rWQIj6wFzbPfRjiykIEhBH4obrXJ/I=
 github.com/btcsuite/snappy-go v1.0.0 h1:ZxaA6lo2EpxGddsA8JwWOcxlzRybb444sgmeJQMJGQE=
@@ -16,96 +11,96 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dchest/blake256 v1.0.0 h1:6gUgI5MHdz9g0TdrgKqXsoDX+Zjxmm1Sc6OsoGru50I=
 github.com/dchest/blake256 v1.0.0/go.mod h1:xXNWCE1jsAP8DAjP+rKw2MbeqLczjI3TRx2VK+9OEYY=
+github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4=
 github.com/decred/base58 v1.0.0 h1:BVi1FQCThIjZ0ehG+I99NJ51o0xcc9A/fDKhmJxY6+w=
 github.com/decred/base58 v1.0.0/go.mod h1:LLY1p5e3g91byL/UO1eiZaYd+uRoVRarybgcoymu9Ks=
-github.com/decred/dcrd/blockchain v1.0.2 h1:+gJFfgv5LK+LcadyoiMln838/aU3rxDd0Smqogd6fkA=
-github.com/decred/dcrd/blockchain v1.0.2/go.mod h1:R/4XnwNOTj5IP8jQIUzrJ8zhr/7EOk09IMODwBamZoI=
-github.com/decred/dcrd/blockchain v1.1.1 h1:CWr90sZ2YLQz84EGT+X/pzU+9AZB1eXQUy+4fsJSt5w=
-github.com/decred/dcrd/blockchain v1.1.1/go.mod h1:zxi/41LgzHitpz/CZu0gxHyFHz8+ysd3lH8E3P5Uifg=
-github.com/decred/dcrd/blockchain/stake v1.0.1 h1:IYGsNZRyMUsoFtVAUjd7XIccrIQ4YIqDeNzQJCjyS8A=
-github.com/decred/dcrd/blockchain/stake v1.0.1/go.mod h1:hgoGmWMIu2LLApBbcguVpzCEEfX7M2YhuMrQdpohJzc=
-github.com/decred/dcrd/blockchain/stake v1.1.0 h1:kCxZdQ2/UfcD+XjE3wlCv0vLKWR9ZFtjbbTTpudb74o=
-github.com/decred/dcrd/blockchain/stake v1.1.0/go.mod h1:WRuaml4bcyZYza1NT3qizlLcQwMIcAQRENvZVb2t884=
-github.com/decred/dcrd/chaincfg v1.1.1 h1:qRZkiA7ucsfsQPE/G/U1OnEUFozDl1MvM4ysJCUndLU=
-github.com/decred/dcrd/chaincfg v1.1.1/go.mod h1:UlGtnp8Xx9YK+etBTybGjoFGoGXSw2bxZQuAnwfKv6I=
-github.com/decred/dcrd/chaincfg v1.2.0/go.mod h1:kpoGTMIriKn5hHRSu5b65+Q9LlGUdbQcMzGujac1BVs=
-github.com/decred/dcrd/chaincfg v1.3.0 h1:DEysyX1/kxlWbY97PTIPpGbMOp3+n2iixi3m9d27A6c=
-github.com/decred/dcrd/chaincfg v1.3.0/go.mod h1:kpoGTMIriKn5hHRSu5b65+Q9LlGUdbQcMzGujac1BVs=
+github.com/decred/dcrd/blockchain v1.2.0 h1:XiNd8lsU3marW7Z9xwctaXYm/7xakWOla/ZYJIPIG9w=
+github.com/decred/dcrd/blockchain v1.2.0/go.mod h1:U/cia18M3LOJjk30jYRIEJivI6wh8v+53+gANC2npA4=
+github.com/decred/dcrd/blockchain/stake v1.2.1 h1:Llj+mKNJEnMskeakMj62hllNVtiHF2vo7cDxsvoLVFg=
+github.com/decred/dcrd/blockchain/stake v1.2.1/go.mod h1:3YGhsM2WCwUM6o0WLGoTCUXLOOw6H7tqXtVtWlcCE/Y=
+github.com/decred/dcrd/blockchain/stake/v2 v2.0.0/go.mod h1:jv/rKMcZ87lhvVkHot/tElxeAYEUJ3mnKPHJ7WPq86U=
+github.com/decred/dcrd/blockchain/standalone v1.0.0 h1:bPkFgSV7/NeZI+ZEGhaOP+XccCUBTIJb3YTf8dMwe8g=
+github.com/decred/dcrd/blockchain/standalone v1.0.0/go.mod h1:U5lOleFSi1nL7heSdLgEtuvg0udS1p3cvHxvLJbihfE=
+github.com/decred/dcrd/chaincfg v1.5.1/go.mod h1:FukMzTjkwzjPU+hK7CqDMQe3NMbSZAYU5PAcsx1wlv0=
+github.com/decred/dcrd/chaincfg v1.5.2 h1:dd6l9rqcpxg2GF5neBmE2XxRc5Lqda45fWmN4XOJRW8=
+github.com/decred/dcrd/chaincfg v1.5.2/go.mod h1:FukMzTjkwzjPU+hK7CqDMQe3NMbSZAYU5PAcsx1wlv0=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.1 h1:0vG7U9+dSjSCaHQKdoSKURK2pOb47+b+8FK5q4+Je7M=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.1/go.mod h1:OVfvaOsNLS/A1y4Eod0Ip/Lf8qga7VXCQjUQLbkY0Go=
-github.com/decred/dcrd/database v1.0.1 h1:BSIerNf4RhSA0iDhiE/320RYqD2y9T+SCj99Pv7svgo=
-github.com/decred/dcrd/database v1.0.1/go.mod h1:ILCeyOHFew3fZ7K2B9jl+tp5qFOap/pEGoo6Yy6Wk0g=
-github.com/decred/dcrd/database v1.0.3 h1:e5Q3gDt9LwfvpZxYqFF3OVzgr8bGeC1cen+V3mv/CCw=
-github.com/decred/dcrd/database v1.0.3/go.mod h1:TLxRwIV8x85+dxPTLAWu4mHg45TkKrrza5xzwOS1QtA=
-github.com/decred/dcrd/dcrec v0.0.0-20180721005212-59fe2b293f69/go.mod h1:cRAH1SNk8Mi9hKBc/DHbeiWz/fyO8KWZR3H7okrIuOA=
-github.com/decred/dcrd/dcrec v0.0.0-20180721031028-5369a485acf6/go.mod h1:cRAH1SNk8Mi9hKBc/DHbeiWz/fyO8KWZR3H7okrIuOA=
-github.com/decred/dcrd/dcrec v0.0.0-20180801202239-0761de129164 h1:N5s3yVfjBNW6XNG3gLxYpvt0IUjUsp/FRfC75QpSI+E=
-github.com/decred/dcrd/dcrec v0.0.0-20180801202239-0761de129164/go.mod h1:cRAH1SNk8Mi9hKBc/DHbeiWz/fyO8KWZR3H7okrIuOA=
-github.com/decred/dcrd/dcrec/edwards v0.0.0-20180721005212-59fe2b293f69/go.mod h1:+ehP0Hk/mesyZXttxCtBbhPX23BMpZJ1pcVBqUfbmvU=
-github.com/decred/dcrd/dcrec/edwards v0.0.0-20180721031028-5369a485acf6 h1:1T33paUnhZTyLN60k5DSy4CH9uTN4vQ9TdSyu4O1ox8=
-github.com/decred/dcrd/dcrec/edwards v0.0.0-20180721031028-5369a485acf6/go.mod h1:+ehP0Hk/mesyZXttxCtBbhPX23BMpZJ1pcVBqUfbmvU=
-github.com/decred/dcrd/dcrec/edwards v0.0.0-20181208004914-a0816cf4301f h1:NF7vp3nZ4MsAiXswGmE//m83jCN0lDsQrLI7IwLCTlo=
-github.com/decred/dcrd/dcrec/edwards v0.0.0-20181208004914-a0816cf4301f/go.mod h1:+ehP0Hk/mesyZXttxCtBbhPX23BMpZJ1pcVBqUfbmvU=
-github.com/decred/dcrd/dcrec/secp256k1 v1.0.0 h1:Le54WTGdTQv7XYXpS31uhFE8LZE7ypwsIL+FgDP2x5Q=
-github.com/decred/dcrd/dcrec/secp256k1 v1.0.0/go.mod h1:JPMFscGlgXTV684jxQNDijae2qrh0fLG7pJBimaYotE=
+github.com/decred/dcrd/chaincfg/chainhash v1.0.2 h1:rt5Vlq/jM3ZawwiacWjPa+smINyLRN07EO0cNBV6DGU=
+github.com/decred/dcrd/chaincfg/chainhash v1.0.2/go.mod h1:BpbrGgrPTr3YJYRN3Bm+D9NuaFd+zGyNeIKgrhCXK60=
+github.com/decred/dcrd/chaincfg/v2 v2.0.2/go.mod h1:hpKvhLCDAD/xDZ3V1Pqpv9fIKVYYi11DyxETguazyvg=
+github.com/decred/dcrd/chaincfg/v2 v2.1.0 h1:2S7TL9YWnKDDiH5bTpp3xcBo+1gl1IXFi5KU4QwSIDk=
+github.com/decred/dcrd/chaincfg/v2 v2.1.0/go.mod h1:hpKvhLCDAD/xDZ3V1Pqpv9fIKVYYi11DyxETguazyvg=
+github.com/decred/dcrd/crypto/blake256 v1.0.0 h1:/8DMNYp9SGi5f0w7uCm6d6M4OU2rGFK09Y2A4Xv7EE0=
+github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc=
+github.com/decred/dcrd/database v1.1.0 h1:A9doThqEjOiE8NicDbMmRwr74itM47rcOzzWpy+keYU=
+github.com/decred/dcrd/database v1.1.0/go.mod h1:/c8suHgDP20weTDFpObwvNbrMMlxn2LM4Tvm377ztwQ=
+github.com/decred/dcrd/database/v2 v2.0.0/go.mod h1:Sj2lvTRB0mfSu9uD7ObfwCY/eJ954GFU/X+AndJIyfE=
+github.com/decred/dcrd/dcrec v1.0.0 h1:W+z6Es+Rai3MXYVoPAxYr5U1DGis0Co33scJ6uH2J6o=
+github.com/decred/dcrd/dcrec v1.0.0/go.mod h1:HIaqbEJQ+PDzQcORxnqen5/V1FR3B4VpIfmePklt8Q8=
+github.com/decred/dcrd/dcrec/edwards v1.0.0 h1:UDcPNzclKiJlWqV3x1Fl8xMCJrolo4PB4X9t8LwKDWU=
+github.com/decred/dcrd/dcrec/edwards v1.0.0/go.mod h1:HblVh1OfMt7xSxUL1ufjToaEvpbjpWvvTAUx4yem8BI=
 github.com/decred/dcrd/dcrec/secp256k1 v1.0.1 h1:EFWVd1p0t0Y5tnsm/dJujgV0ORogRJ6vo7CMAjLseAc=
 github.com/decred/dcrd/dcrec/secp256k1 v1.0.1/go.mod h1:lhu4eZFSfTJWUnR3CFRcpD+Vta0KUAqnhTsTksHXgy0=
-github.com/decred/dcrd/dcrjson v1.0.0 h1:50DnA0XeV2JrQXoHh43TCKmH+kz2gHjZ1Mj/Pdk7Oz0=
-github.com/decred/dcrd/dcrjson v1.0.0/go.mod h1:ozddIaeF+EAvZZvFuB3zpfxhyxBGfvbt22crQh+PYuI=
-github.com/decred/dcrd/dcrutil v1.1.1 h1:zOkGiumN/JkobhAgpG/zfFgUoolGKVGYT5na1hbYUoE=
-github.com/decred/dcrd/dcrutil v1.1.1/go.mod h1:Jsttr0pEvzPAw+qay1kS1/PsbZYPyhluiNwwY6yBJS4=
-github.com/decred/dcrd/dcrutil v1.2.0 h1:Pd5Wf650g6Xu6luYDfGkh1yiUoPUAgqzRu6K+BGyJGg=
-github.com/decred/dcrd/dcrutil v1.2.0/go.mod h1:tUNHS2gj7ApeEVS8gb6O+4wJW7w3O2MSRyRdcjW1JxU=
-github.com/decred/dcrd/gcs v1.0.1/go.mod h1:YwutGzusSdJM79CJtxCo9t7WRCvnkLtWSD19TPo1i9g=
-github.com/decred/dcrd/txscript v1.0.1 h1:IMgxZFCw3AyG4EbKwywE3SDNshOSHsoUK1Wk/5GqWJ0=
-github.com/decred/dcrd/txscript v1.0.1/go.mod h1:FqUX07Y+u3cJ1eIGPoyWbJg+Wk1NTllln/TyDpx9KnY=
-github.com/decred/dcrd/txscript v1.0.2 h1:kzJZDuteyzvI15VNhtgFHxeeq210RTkFyfzN7d+1iPo=
-github.com/decred/dcrd/txscript v1.0.2/go.mod h1:hmUOHFlOjU7H6T/czt6kurWwXJvGPGKKGtXoft6w/qY=
-github.com/decred/dcrd/wire v1.1.0 h1:G+3CugtxNbToUN8RKWqm74yLfzJJ2BKMOr2RgWc4TyY=
-github.com/decred/dcrd/wire v1.1.0/go.mod h1:/JKOsLInOJu6InN+/zH5AyCq3YDIOW/EqcffvU8fJHM=
+github.com/decred/dcrd/dcrec/secp256k1 v1.0.2 h1:awk7sYJ4pGWmtkiGHFfctztJjHMKGLV8jctGQhAbKe0=
+github.com/decred/dcrd/dcrec/secp256k1 v1.0.2/go.mod h1:CHTUIVfmDDd0KFVFpNX1pFVCBUegxW387nN0IGwNKR0=
+github.com/decred/dcrd/dcrutil v1.3.0/go.mod h1:7fUT70QAarhDwQK62g92uDbbYpjXlXngpy5RBiecufo=
+github.com/decred/dcrd/dcrutil v1.4.0 h1:xD5aUqysGQnsnP1c9J0kGeW8lDIwFGC3ja/gE3HnpCs=
+github.com/decred/dcrd/dcrutil v1.4.0/go.mod h1:Bs74gm1jQ9ZAbmEh9FWOEZ1HQzlMg5iPATDMzMnCMlQ=
+github.com/decred/dcrd/dcrutil/v2 v2.0.0 h1:HTqn2tZ8eqBF4y3hJwjyKBmJt16y7/HjzpE82E/crhY=
+github.com/decred/dcrd/dcrutil/v2 v2.0.0/go.mod h1:gUshVAXpd51DlcEhr51QfWL2HJGkMDM1U8chY+9VvQg=
+github.com/decred/dcrd/gcs v1.1.0/go.mod h1:yBjhj217Vw5lw3aKnCdHip7fYb9zwMos8bCy5s79M9w=
+github.com/decred/dcrd/txscript v1.1.0 h1:MwkLXdc4Yq83oeNNEQJdlBTkNlorKXn8Nd5W2JXyMZg=
+github.com/decred/dcrd/txscript v1.1.0/go.mod h1:gbcq6gpGfKddPmZSKp+17ils2cLzUqHopXf8H5rCY7Y=
+github.com/decred/dcrd/txscript/v2 v2.0.0/go.mod h1:WStcyYYJa+PHJB4XjrLDRzV96/Z4thtsu8mZoVrU6C0=
 github.com/decred/dcrd/wire v1.2.0 h1:HqJVB7vcklIguzFWgRXw/WYCQ9cD3bUC5TKj53i1Hng=
 github.com/decred/dcrd/wire v1.2.0/go.mod h1:/JKOsLInOJu6InN+/zH5AyCq3YDIOW/EqcffvU8fJHM=
+github.com/decred/dcrd/wire v1.3.0 h1:X76I2/a8esUmxXmFpJpAvXEi014IA4twgwcOBeIS8lE=
+github.com/decred/dcrd/wire v1.3.0/go.mod h1:fnKGlUY2IBuqnpxx5dYRU5Oiq392OBqAuVjRVSkIoXM=
+github.com/decred/go-socks v1.1.0 h1:dnENcc0KIqQo3HSXdgboXAHgqsCIutkqq6ntQjYtm2U=
+github.com/decred/go-socks v1.1.0/go.mod h1:sDhHqkZH0X4JjSa02oYOGhcGHYp12FsY1jQ/meV8md0=
 github.com/decred/slog v1.0.0 h1:Dl+W8O6/JH6n2xIFN2p3DNjCmjYwvrXsjlSJTQQ4MhE=
 github.com/decred/slog v1.0.0/go.mod h1:zR98rEZHSnbZ4WHZtO0iqmSZjDLKhkXfrPTZQKtAonQ=
 github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
 github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
-github.com/golang/protobuf v1.1.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
 github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
 github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
+github.com/jessevdk/go-flags v1.4.0 h1:4IU2WS7AumrZ/40jfhf4QVDMsQwqA7VEHozFRrGARJA=
 github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
 github.com/jrick/logrotate v1.0.0 h1:lQ1bL/n9mBNeIXoTUoYRlK4dHuNJVofX9oWqBtPnSzI=
 github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
-github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
-github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/onsi/ginkgo v1.6.0 h1:Ix8l273rp3QzYgXSR+c8d1fTG7UPgYkOSELPhiY/YGw=
 github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs=
+github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
 github.com/onsi/gomega v1.4.1 h1:PZSj/UFNaVp3KxrzHOcS7oyuWA7LoOY/77yCTEFu21U=
 github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
-golang.org/x/crypto v0.0.0-20180718160520-a2144134853f h1:lRy+hhwk7YT7MsKejxuz0C5Q1gk6p/QoPQYEmKmGFb8=
-golang.org/x/crypto v0.0.0-20180718160520-a2144134853f/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
+github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU=
+github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8 h1:1wopBVtVdWnn03fZelqdXTqk7U7zPQCb+T4rbU9ZEoU=
+golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/net v0.0.0-20180719180050-a680a1efc54d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24 h1:mEsFm194MmS9vCwxFy+zwu0EU7ZkxxMD1iH++vmGdUY=
-golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3 h1:0GoQqolDA55aaLxZyTzK/Y2ePZzZTUrRacwib7cNsYQ=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f h1:wMNYb4v58l5UBM7MYRLPG6ZhfOqbKu7X5eyFl8ZhKvA=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20180816055513-1c9583448a9c/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20181004145325-8469e314837c h1:SJ7JoQNVl3mC7EWkkONgBWgCno8LcABIJwFMkWBC+EY=
-golang.org/x/sys v0.0.0-20181004145325-8469e314837c/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20181206074257-70b957f3b65e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
 gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
 gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE=
 gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/goclean.sh b/goclean.sh
index 6a5ac51..1d5837d 100755
--- a/goclean.sh
+++ b/goclean.sh
@@ -13,4 +13,5 @@ set -ex
 golangci-lint run --build-tags opencl --disable-all --deadline=10m \
   --enable=gofmt \
   --enable=vet \
+  --enable=gosimple \
   --enable=ineffassign
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 3adbb34..34944d2 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -23,12 +23,10 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 
-	"github.com/btcsuite/go-socks/socks"
-
+	"github.com/decred/go-socks/socks"
 	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/wire"
-
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )

From 0fca758befc47be629b68cc9a47590a06c6671ef Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 16 May 2023 12:02:26 -0500
Subject: [PATCH 101/150] build: update deps (#191)

* build: test latest go

* build: update deps
---
 .github/workflows/go.yml | 15 ++++---
 adl/adl.go               |  2 +
 cl/buffer.go             | 27 ++++--------
 cl/context.go            | 10 ++---
 cl/event.go              |  6 +--
 cl/event11.go            |  6 +--
 cl/queue1x.go            |  4 +-
 cl/sampler1x.go          |  3 +-
 cldevice.go              |  8 +---
 config.go                | 10 ++---
 device.go                |  9 ++--
 go.mod                   |  8 ++--
 go.sum                   | 91 +++++++---------------------------------
 goclean.sh               |  2 -
 stratum/stratum.go       |  6 +--
 15 files changed, 62 insertions(+), 145 deletions(-)

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index fff56f6..3469d00 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -6,17 +6,21 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        go: [1.12, 1.13]
+        go: [1.15, 1.16]
     steps:
       - name: Set up Go
-        uses: actions/setup-go@v1
+        uses: actions/setup-go@v2
         with:
           go-version: ${{ matrix.go }}
       - name: Check out source
-        uses: actions/checkout@v1
+        uses: actions/checkout@v2
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit ocl-icd-opencl-dev opencl-headers nvidia-opencl-dev
       - name: Install Linters
-        run: "curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(go env GOPATH)/bin v1.23.3"
-      - name: Build
+        run: "curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(go env GOPATH)/bin v1.39.0"
+      - name: Build OpenCL
         env:
           GO111MODULE: "on"
         run: go build -tags opencl ./...
@@ -24,5 +28,4 @@ jobs:
         env:
           GO111MODULE: "on"
         run: |
-          export PATH=${PATH}:$(go env GOPATH)/bin
           sh ./goclean.sh
diff --git a/adl/adl.go b/adl/adl.go
index c0695fb..26a24b1 100644
--- a/adl/adl.go
+++ b/adl/adl.go
@@ -1,3 +1,5 @@
+// +build opencladl,!cuda,!opencl
+
 package adl
 
 /*
diff --git a/cl/buffer.go b/cl/buffer.go
index d318fa4..efe0c84 100644
--- a/cl/buffer.go
+++ b/cl/buffer.go
@@ -16,9 +16,8 @@ func CLCreateBuffer(context CL_context,
 	errcode_ret *CL_int) CL_mem {
 
 	var c_errcode_ret C.cl_int
-	var c_memobj C.cl_mem
 
-	c_memobj = C.clCreateBuffer(context.cl_context,
+	c_memobj := C.clCreateBuffer(context.cl_context,
 		C.cl_mem_flags(flags),
 		C.size_t(size),
 		host_ptr,
@@ -38,9 +37,8 @@ func CLCreateSubBuffer(buffer CL_mem,
 	errcode_ret *CL_int) CL_mem {
 
 	var c_errcode_ret C.cl_int
-	var c_memobj C.cl_mem
 
-	c_memobj = C.clCreateSubBuffer(buffer.cl_mem,
+	c_memobj := C.clCreateSubBuffer(buffer.cl_mem,
 		C.cl_mem_flags(flags),
 		C.cl_buffer_create_type(buffer_create_type),
 		buffer_create_info,
@@ -73,8 +71,7 @@ func CLEnqueueReadBuffer(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -127,8 +124,7 @@ func CLEnqueueWriteBuffer(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -181,8 +177,7 @@ func CLEnqueueCopyBuffer(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -250,8 +245,7 @@ func CLEnqueueReadBufferRect(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -329,8 +323,7 @@ func CLEnqueueWriteBufferRect(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -407,8 +400,7 @@ func CLEnqueueCopyBufferRect(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -475,8 +467,7 @@ func CLEnqueueMapBuffer(command_queue CL_command_queue,
 	var c_ptr_ret unsafe.Pointer
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
diff --git a/cl/context.go b/cl/context.go
index 00bbbac..c67d516 100644
--- a/cl/context.go
+++ b/cl/context.go
@@ -57,8 +57,7 @@ func init() {
 
 //export go_ctx_notify
 func go_ctx_notify(errinfo *C.char, private_info unsafe.Pointer, cb C.int, user_data unsafe.Pointer) {
-	var c_user_data []unsafe.Pointer
-	c_user_data = *(*[]unsafe.Pointer)(user_data)
+	c_user_data := *(*[]unsafe.Pointer)(user_data)
 	ctx_notify[c_user_data[1]](C.GoString(errinfo), private_info, int(cb), c_user_data[0])
 }
 
@@ -76,13 +75,12 @@ func CLCreateContext(properties []CL_context_properties,
 		c_errcode_ret = CL_INVALID_VALUE
 		c_context = nil
 	} else {
-		var c_properties []C.cl_context_properties
 		var c_properties_ptr *C.cl_context_properties
 		var c_devices []C.cl_device_id
 		var c_devices_ptr *C.cl_device_id
 
 		if properties != nil {
-			c_properties = make([]C.cl_context_properties, len(properties))
+			c_properties := make([]C.cl_context_properties, len(properties))
 			for i := 0; i < len(properties); i++ {
 				c_properties[i] = C.cl_context_properties(properties[i])
 			}
@@ -125,7 +123,6 @@ func CLCreateContext(properties []CL_context_properties,
 				nil,
 				nil,
 				&c_errcode_ret)
-
 		}
 	}
 
@@ -149,11 +146,10 @@ func CLCreateContextFromType(properties []CL_context_properties,
 		c_errcode_ret = CL_INVALID_VALUE
 		c_context = nil
 	} else {
-		var c_properties []C.cl_context_properties
 		var c_properties_ptr *C.cl_context_properties
 
 		if properties != nil {
-			c_properties = make([]C.cl_context_properties, len(properties))
+			c_properties := make([]C.cl_context_properties, len(properties))
 			for i := 0; i < len(properties); i++ {
 				c_properties[i] = C.cl_context_properties(properties[i])
 			}
diff --git a/cl/event.go b/cl/event.go
index fb75971..c399649 100644
--- a/cl/event.go
+++ b/cl/event.go
@@ -42,10 +42,9 @@ func go_evt_notify(event C.cl_event, event_command_exec_status C.cl_int, user_da
 func CLCreateUserEvent(context CL_context,
 	errcode_ret *CL_int) CL_event {
 
-	var c_event C.cl_event
 	var c_errcode_ret C.cl_int
 
-	c_event = C.clCreateUserEvent(context.cl_context, &c_errcode_ret)
+	c_event := C.clCreateUserEvent(context.cl_context, &c_errcode_ret)
 
 	if errcode_ret != nil {
 		*errcode_ret = CL_int(c_errcode_ret)
@@ -66,8 +65,7 @@ func CLWaitForEvents(num_events CL_uint,
 		return CL_INVALID_VALUE
 	}
 
-	var c_event_list []C.cl_event
-	c_event_list = make([]C.cl_event, len(event_list))
+	c_event_list := make([]C.cl_event, len(event_list))
 	for i := 0; i < len(event_list); i++ {
 		c_event_list[i] = event_list[i].cl_event
 	}
diff --git a/cl/event11.go b/cl/event11.go
index ef81cda..e3ec590 100644
--- a/cl/event11.go
+++ b/cl/event11.go
@@ -11,9 +11,8 @@ import "C"
 
 func CLEnqueueMarker(command_queue CL_command_queue, event *CL_event) CL_int {
 	var c_event C.cl_event
-	var c_errcode_ret C.cl_int
 
-	c_errcode_ret = C.clEnqueueMarker(command_queue.cl_command_queue, &c_event)
+	c_errcode_ret := C.clEnqueueMarker(command_queue.cl_command_queue, &c_event)
 
 	if event != nil {
 		event.cl_event = c_event
@@ -39,8 +38,7 @@ func CLEnqueueWaitForEvents(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
diff --git a/cl/queue1x.go b/cl/queue1x.go
index 1ad602b..bc7b531 100644
--- a/cl/queue1x.go
+++ b/cl/queue1x.go
@@ -9,10 +9,10 @@ func CLCreateCommandQueue(context CL_context,
 	device CL_device_id,
 	properties CL_command_queue_properties,
 	errcode_ret *CL_int) CL_command_queue {
+
 	var c_errcode_ret C.cl_int
-	var c_command_queue C.cl_command_queue
 
-	c_command_queue = C.clCreateCommandQueue(context.cl_context,
+	c_command_queue := C.clCreateCommandQueue(context.cl_context,
 		device.cl_device_id,
 		C.cl_command_queue_properties(properties),
 		&c_errcode_ret)
diff --git a/cl/sampler1x.go b/cl/sampler1x.go
index f79baa2..dbb1395 100644
--- a/cl/sampler1x.go
+++ b/cl/sampler1x.go
@@ -12,9 +12,8 @@ func CLCreateSampler(context CL_context,
 	errcode_ret *CL_int) CL_sampler {
 
 	var c_errcode_ret C.cl_int
-	var c_sampler C.cl_sampler
 
-	c_sampler = C.clCreateSampler(context.cl_context,
+	c_sampler := C.clCreateSampler(context.cl_context,
 		C.cl_bool(normalized_coords),
 		C.cl_addressing_mode(addressing_mode),
 		C.cl_filter_mode(filter_mode),
diff --git a/cldevice.go b/cldevice.go
index e5e739c..c60724b 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -139,12 +139,10 @@ func fanControlSet(index int, fanCur uint32, tempTargetType string,
 	case TargetLower:
 		fanNewPercent = fanCur + fanAdjustmentPercent
 		fanNewValue = amdgpuFanPercentToValue(fanNewPercent)
-		break
 	// Increase the temperature by decreasing the fan speed
 	case TargetHigher:
 		fanNewPercent = fanCur - fanAdjustmentPercent
 		fanNewValue = amdgpuFanPercentToValue(fanNewPercent)
-		break
 	}
 
 	fanPath := amdgpuGetSysfsPath(index, "fan")
@@ -179,7 +177,7 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 	if err != nil {
 		return nil, nil, err
 	}
-	str := string(buf.Bytes())
+	str := buf.String()
 	programFinal := []byte(str)
 
 	programSize[0] = cl.CL_size_t(len(programFinal))
@@ -275,7 +273,6 @@ func determineDeviceKind(index int, deviceType string) string {
 				deviceKind = DeviceKindAMDGPU
 			}
 		}
-		break
 	}
 
 	return deviceKind
@@ -446,7 +443,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	}
 
 	// Create the program.
-	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc[:],
+	d.program = cl.CLCreateProgramWithSource(d.context, 1, progSrc,
 		progSize[:], &status)
 	if status != cl.CL_SUCCESS {
 		return nil, clError(status, "CLCreateProgramWithSource")
@@ -561,7 +558,6 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 			atomic.StoreUint32(&d.temperature, temperature)
 			d.fanTempActive = true
 		}
-		break
 	}
 
 	// Check if temperature target is specified
diff --git a/config.go b/config.go
index fe902ed..5706957 100644
--- a/config.go
+++ b/config.go
@@ -14,10 +14,10 @@ import (
 	"strings"
 	"time"
 
-	"github.com/jessevdk/go-flags"
-	"github.com/decred/dcrd/chaincfg"
-	"github.com/decred/dcrd/dcrutil"
+	"github.com/decred/dcrd/chaincfg/v3"
+	"github.com/decred/dcrd/dcrutil/v3"
 	"github.com/decred/slog"
+	"github.com/jessevdk/go-flags"
 )
 
 const (
@@ -583,10 +583,10 @@ func loadConfig() (*config, []string, error) {
 	switch {
 	case cfg.TestNet:
 		defaultRPCPort = defaultRPCPortTestNet
-		chainParams = &chaincfg.TestNet3Params
+		chainParams = chaincfg.TestNet3Params()
 	case cfg.SimNet:
 		defaultRPCPort = defaultRPCPortSimNet
-		chainParams = &chaincfg.SimNetParams
+		chainParams = chaincfg.SimNetParams()
 	default:
 		defaultRPCPort = defaultRPCPortMainNet
 	}
diff --git a/device.go b/device.go
index c3f2644..371041d 100644
--- a/device.go
+++ b/device.go
@@ -8,16 +8,16 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/decred/dcrd/blockchain"
-	"github.com/decred/dcrd/chaincfg"
+	"github.com/decred/dcrd/blockchain/standalone"
 	"github.com/decred/dcrd/chaincfg/chainhash"
+	"github.com/decred/dcrd/chaincfg/v3"
 
 	"github.com/decred/gominer/blake256"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
-var chainParams = &chaincfg.MainNetParams
+var chainParams = chaincfg.MainNetParams()
 var deviceLibraryInitialized = false
 
 // Constants for fan and temperature bits
@@ -279,7 +279,7 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 
 	// Hashes that reach this logic and fail the minimal proof of
 	// work check are considered to be hardware errors.
-	hashNum := blockchain.HashToBig(&hash)
+	hashNum := standalone.HashToBig(&hash)
 	if hashNum.Cmp(chainParams.PowLimit) > 0 {
 		minrLog.Errorf("DEV #%d Hardware error found, hash %v above "+
 			"minimum target %064x", d.index, hash, d.work.Target.Bytes())
@@ -351,7 +351,6 @@ func (d *Device) UpdateFanTemp() {
 			fanPercent, temperature := deviceStats(d.index)
 			atomic.StoreUint32(&d.fanPercent, fanPercent)
 			atomic.StoreUint32(&d.temperature, temperature)
-			break
 		}
 	}
 }
diff --git a/go.mod b/go.mod
index 632fddd..7e3c917 100644
--- a/go.mod
+++ b/go.mod
@@ -3,11 +3,11 @@ module github.com/decred/gominer
 require (
 	github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8
 	github.com/davecgh/go-spew v1.1.1
-	github.com/decred/dcrd/blockchain v1.2.0
-	github.com/decred/dcrd/chaincfg v1.5.2
+	github.com/decred/dcrd/blockchain/standalone v1.0.0
 	github.com/decred/dcrd/chaincfg/chainhash v1.0.2
-	github.com/decred/dcrd/dcrutil v1.4.0
-	github.com/decred/dcrd/wire v1.3.0
+	github.com/decred/dcrd/chaincfg/v3 v3.0.0
+	github.com/decred/dcrd/dcrutil/v3 v3.0.0
+	github.com/decred/dcrd/wire v1.4.0
 	github.com/decred/go-socks v1.1.0
 	github.com/decred/slog v1.0.0
 	github.com/jessevdk/go-flags v1.4.0
diff --git a/go.sum b/go.sum
index 0b5c6df..75f082f 100644
--- a/go.sum
+++ b/go.sum
@@ -2,105 +2,42 @@ github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 h1:w1UutsfOrms1J05zt7I
 github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412/go.mod h1:WPjqKcmVOxf0XSf3YxCJs6N6AOSrOx3obionmG7T0y0=
 github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8 h1:lnbKU7kkMoF75PDPYaj0DLoD0p6lWtzeyXSR94PrQto=
 github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8/go.mod h1:GnBnFz4V/+kxwKFnquvOOi+IjZoVJsIUbcAVOXLCxCo=
-github.com/btcsuite/goleveldb v1.0.0 h1:Tvd0BfvqX9o823q1j2UZ/epQo09eJh6dTcRp79ilIN4=
-github.com/btcsuite/goleveldb v1.0.0/go.mod h1:QiK9vBlgftBg6rWQIj6wFzbPfRjiykIEhBH4obrXJ/I=
-github.com/btcsuite/snappy-go v1.0.0 h1:ZxaA6lo2EpxGddsA8JwWOcxlzRybb444sgmeJQMJGQE=
-github.com/btcsuite/snappy-go v1.0.0/go.mod h1:8woku9dyThutzjeg+3xrA5iCpBRH8XEEg3lh6TiUghc=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dchest/blake256 v1.0.0 h1:6gUgI5MHdz9g0TdrgKqXsoDX+Zjxmm1Sc6OsoGru50I=
 github.com/dchest/blake256 v1.0.0/go.mod h1:xXNWCE1jsAP8DAjP+rKw2MbeqLczjI3TRx2VK+9OEYY=
-github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4=
-github.com/decred/base58 v1.0.0 h1:BVi1FQCThIjZ0ehG+I99NJ51o0xcc9A/fDKhmJxY6+w=
-github.com/decred/base58 v1.0.0/go.mod h1:LLY1p5e3g91byL/UO1eiZaYd+uRoVRarybgcoymu9Ks=
-github.com/decred/dcrd/blockchain v1.2.0 h1:XiNd8lsU3marW7Z9xwctaXYm/7xakWOla/ZYJIPIG9w=
-github.com/decred/dcrd/blockchain v1.2.0/go.mod h1:U/cia18M3LOJjk30jYRIEJivI6wh8v+53+gANC2npA4=
-github.com/decred/dcrd/blockchain/stake v1.2.1 h1:Llj+mKNJEnMskeakMj62hllNVtiHF2vo7cDxsvoLVFg=
-github.com/decred/dcrd/blockchain/stake v1.2.1/go.mod h1:3YGhsM2WCwUM6o0WLGoTCUXLOOw6H7tqXtVtWlcCE/Y=
-github.com/decred/dcrd/blockchain/stake/v2 v2.0.0/go.mod h1:jv/rKMcZ87lhvVkHot/tElxeAYEUJ3mnKPHJ7WPq86U=
+github.com/decred/base58 v1.0.3 h1:KGZuh8d1WEMIrK0leQRM47W85KqCAdl2N+uagbctdDI=
+github.com/decred/base58 v1.0.3/go.mod h1:pXP9cXCfM2sFLb2viz2FNIdeMWmZDBKG3ZBYbiSM78E=
 github.com/decred/dcrd/blockchain/standalone v1.0.0 h1:bPkFgSV7/NeZI+ZEGhaOP+XccCUBTIJb3YTf8dMwe8g=
 github.com/decred/dcrd/blockchain/standalone v1.0.0/go.mod h1:U5lOleFSi1nL7heSdLgEtuvg0udS1p3cvHxvLJbihfE=
-github.com/decred/dcrd/chaincfg v1.5.1/go.mod h1:FukMzTjkwzjPU+hK7CqDMQe3NMbSZAYU5PAcsx1wlv0=
-github.com/decred/dcrd/chaincfg v1.5.2 h1:dd6l9rqcpxg2GF5neBmE2XxRc5Lqda45fWmN4XOJRW8=
-github.com/decred/dcrd/chaincfg v1.5.2/go.mod h1:FukMzTjkwzjPU+hK7CqDMQe3NMbSZAYU5PAcsx1wlv0=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.1 h1:0vG7U9+dSjSCaHQKdoSKURK2pOb47+b+8FK5q4+Je7M=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.1/go.mod h1:OVfvaOsNLS/A1y4Eod0Ip/Lf8qga7VXCQjUQLbkY0Go=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.2 h1:rt5Vlq/jM3ZawwiacWjPa+smINyLRN07EO0cNBV6DGU=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.2/go.mod h1:BpbrGgrPTr3YJYRN3Bm+D9NuaFd+zGyNeIKgrhCXK60=
-github.com/decred/dcrd/chaincfg/v2 v2.0.2/go.mod h1:hpKvhLCDAD/xDZ3V1Pqpv9fIKVYYi11DyxETguazyvg=
-github.com/decred/dcrd/chaincfg/v2 v2.1.0 h1:2S7TL9YWnKDDiH5bTpp3xcBo+1gl1IXFi5KU4QwSIDk=
-github.com/decred/dcrd/chaincfg/v2 v2.1.0/go.mod h1:hpKvhLCDAD/xDZ3V1Pqpv9fIKVYYi11DyxETguazyvg=
+github.com/decred/dcrd/chaincfg/v3 v3.0.0 h1:+TFbu7ZmvBwM+SZz5mrj6cun9ts/6DAL5sqnsaFBHGQ=
+github.com/decred/dcrd/chaincfg/v3 v3.0.0/go.mod h1:EspyubQ7D2w6tjP7rBGDIE7OTbuMgBjR2F2kZFnh31A=
 github.com/decred/dcrd/crypto/blake256 v1.0.0 h1:/8DMNYp9SGi5f0w7uCm6d6M4OU2rGFK09Y2A4Xv7EE0=
 github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc=
-github.com/decred/dcrd/database v1.1.0 h1:A9doThqEjOiE8NicDbMmRwr74itM47rcOzzWpy+keYU=
-github.com/decred/dcrd/database v1.1.0/go.mod h1:/c8suHgDP20weTDFpObwvNbrMMlxn2LM4Tvm377ztwQ=
-github.com/decred/dcrd/database/v2 v2.0.0/go.mod h1:Sj2lvTRB0mfSu9uD7ObfwCY/eJ954GFU/X+AndJIyfE=
+github.com/decred/dcrd/crypto/ripemd160 v1.0.1 h1:TjRL4LfftzTjXzaufov96iDAkbY2R3aTvH2YMYa1IOc=
+github.com/decred/dcrd/crypto/ripemd160 v1.0.1/go.mod h1:F0H8cjIuWTRoixr/LM3REB8obcWkmYx0gbxpQWR8RPg=
 github.com/decred/dcrd/dcrec v1.0.0 h1:W+z6Es+Rai3MXYVoPAxYr5U1DGis0Co33scJ6uH2J6o=
 github.com/decred/dcrd/dcrec v1.0.0/go.mod h1:HIaqbEJQ+PDzQcORxnqen5/V1FR3B4VpIfmePklt8Q8=
-github.com/decred/dcrd/dcrec/edwards v1.0.0 h1:UDcPNzclKiJlWqV3x1Fl8xMCJrolo4PB4X9t8LwKDWU=
-github.com/decred/dcrd/dcrec/edwards v1.0.0/go.mod h1:HblVh1OfMt7xSxUL1ufjToaEvpbjpWvvTAUx4yem8BI=
-github.com/decred/dcrd/dcrec/secp256k1 v1.0.1 h1:EFWVd1p0t0Y5tnsm/dJujgV0ORogRJ6vo7CMAjLseAc=
-github.com/decred/dcrd/dcrec/secp256k1 v1.0.1/go.mod h1:lhu4eZFSfTJWUnR3CFRcpD+Vta0KUAqnhTsTksHXgy0=
-github.com/decred/dcrd/dcrec/secp256k1 v1.0.2 h1:awk7sYJ4pGWmtkiGHFfctztJjHMKGLV8jctGQhAbKe0=
-github.com/decred/dcrd/dcrec/secp256k1 v1.0.2/go.mod h1:CHTUIVfmDDd0KFVFpNX1pFVCBUegxW387nN0IGwNKR0=
-github.com/decred/dcrd/dcrutil v1.3.0/go.mod h1:7fUT70QAarhDwQK62g92uDbbYpjXlXngpy5RBiecufo=
-github.com/decred/dcrd/dcrutil v1.4.0 h1:xD5aUqysGQnsnP1c9J0kGeW8lDIwFGC3ja/gE3HnpCs=
-github.com/decred/dcrd/dcrutil v1.4.0/go.mod h1:Bs74gm1jQ9ZAbmEh9FWOEZ1HQzlMg5iPATDMzMnCMlQ=
-github.com/decred/dcrd/dcrutil/v2 v2.0.0 h1:HTqn2tZ8eqBF4y3hJwjyKBmJt16y7/HjzpE82E/crhY=
-github.com/decred/dcrd/dcrutil/v2 v2.0.0/go.mod h1:gUshVAXpd51DlcEhr51QfWL2HJGkMDM1U8chY+9VvQg=
-github.com/decred/dcrd/gcs v1.1.0/go.mod h1:yBjhj217Vw5lw3aKnCdHip7fYb9zwMos8bCy5s79M9w=
-github.com/decred/dcrd/txscript v1.1.0 h1:MwkLXdc4Yq83oeNNEQJdlBTkNlorKXn8Nd5W2JXyMZg=
-github.com/decred/dcrd/txscript v1.1.0/go.mod h1:gbcq6gpGfKddPmZSKp+17ils2cLzUqHopXf8H5rCY7Y=
-github.com/decred/dcrd/txscript/v2 v2.0.0/go.mod h1:WStcyYYJa+PHJB4XjrLDRzV96/Z4thtsu8mZoVrU6C0=
+github.com/decred/dcrd/dcrec/edwards/v2 v2.0.1 h1:V6eqU1crZzuoFT4KG2LhaU5xDSdkHuvLQsj25wd7Wb4=
+github.com/decred/dcrd/dcrec/edwards/v2 v2.0.1/go.mod h1:d0H8xGMWbiIQP7gN3v2rByWUcuZPm9YsgmnfoxgbINc=
+github.com/decred/dcrd/dcrec/secp256k1/v3 v3.0.0 h1:sgNeV1VRMDzs6rzyPpxyM0jp317hnwiq58Filgag2xw=
+github.com/decred/dcrd/dcrec/secp256k1/v3 v3.0.0/go.mod h1:J70FGZSbzsjecRTiTzER+3f1KZLNaXkuv+yeFTKoxM8=
+github.com/decred/dcrd/dcrutil/v3 v3.0.0 h1:n6uQaTQynIhCY89XsoDk2WQqcUcnbD+zUM9rnZcIOZo=
+github.com/decred/dcrd/dcrutil/v3 v3.0.0/go.mod h1:iVsjcqVzLmYFGCZLet2H7Nq+7imV9tYcuY+0lC2mNsY=
 github.com/decred/dcrd/wire v1.2.0 h1:HqJVB7vcklIguzFWgRXw/WYCQ9cD3bUC5TKj53i1Hng=
 github.com/decred/dcrd/wire v1.2.0/go.mod h1:/JKOsLInOJu6InN+/zH5AyCq3YDIOW/EqcffvU8fJHM=
-github.com/decred/dcrd/wire v1.3.0 h1:X76I2/a8esUmxXmFpJpAvXEi014IA4twgwcOBeIS8lE=
-github.com/decred/dcrd/wire v1.3.0/go.mod h1:fnKGlUY2IBuqnpxx5dYRU5Oiq392OBqAuVjRVSkIoXM=
+github.com/decred/dcrd/wire v1.4.0 h1:KmSo6eTQIvhXS0fLBQ/l7hG7QLcSJQKSwSyzSqJYDk0=
+github.com/decred/dcrd/wire v1.4.0/go.mod h1:WxC/0K+cCAnBh+SKsRjIX9YPgvrjhmE+6pZlel1G7Ro=
 github.com/decred/go-socks v1.1.0 h1:dnENcc0KIqQo3HSXdgboXAHgqsCIutkqq6ntQjYtm2U=
 github.com/decred/go-socks v1.1.0/go.mod h1:sDhHqkZH0X4JjSa02oYOGhcGHYp12FsY1jQ/meV8md0=
 github.com/decred/slog v1.0.0 h1:Dl+W8O6/JH6n2xIFN2p3DNjCmjYwvrXsjlSJTQQ4MhE=
 github.com/decred/slog v1.0.0/go.mod h1:zR98rEZHSnbZ4WHZtO0iqmSZjDLKhkXfrPTZQKtAonQ=
-github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
-github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
-github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
-github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
-github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
 github.com/jessevdk/go-flags v1.4.0 h1:4IU2WS7AumrZ/40jfhf4QVDMsQwqA7VEHozFRrGARJA=
 github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
 github.com/jrick/logrotate v1.0.0 h1:lQ1bL/n9mBNeIXoTUoYRlK4dHuNJVofX9oWqBtPnSzI=
 github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
-github.com/onsi/ginkgo v1.6.0 h1:Ix8l273rp3QzYgXSR+c8d1fTG7UPgYkOSELPhiY/YGw=
-github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs=
-github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/gomega v1.4.1 h1:PZSj/UFNaVp3KxrzHOcS7oyuWA7LoOY/77yCTEFu21U=
-github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
-github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU=
-github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8 h1:1wopBVtVdWnn03fZelqdXTqk7U7zPQCb+T4rbU9ZEoU=
-golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/net v0.0.0-20180719180050-a680a1efc54d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3 h1:0GoQqolDA55aaLxZyTzK/Y2ePZzZTUrRacwib7cNsYQ=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f h1:wMNYb4v58l5UBM7MYRLPG6ZhfOqbKu7X5eyFl8ZhKvA=
-golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
-gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
-gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
-gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
-gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE=
-gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
-gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/goclean.sh b/goclean.sh
index 1d5837d..e0a195e 100755
--- a/goclean.sh
+++ b/goclean.sh
@@ -12,6 +12,4 @@ set -ex
 # check linters
 golangci-lint run --build-tags opencl --disable-all --deadline=10m \
   --enable=gofmt \
-  --enable=vet \
-  --enable=gosimple \
   --enable=ineffassign
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 34944d2..b7d8025 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -23,15 +23,15 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 
-	"github.com/decred/go-socks/socks"
-	"github.com/decred/dcrd/chaincfg"
 	"github.com/decred/dcrd/chaincfg/chainhash"
+	"github.com/decred/dcrd/chaincfg/v3"
 	"github.com/decred/dcrd/wire"
+	"github.com/decred/go-socks/socks"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
-var chainParams = &chaincfg.MainNetParams
+var chainParams = chaincfg.MainNetParams()
 
 // ErrStratumStaleWork indicates that the work to send to the pool was stale.
 var ErrStratumStaleWork = fmt.Errorf("Stale work, throwing away")

From 3415142b128beb33fe28bb2bb1fe90f20c5b8c94 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 20 Jun 2023 13:21:13 -0500
Subject: [PATCH 102/150] build: update deps (#193)

---
 .github/workflows/go.yml | 23 ++++++++------
 .golangci.yml            | 25 +++++++++++++++
 adl/adl.go               |  1 +
 calibrate.go             |  1 +
 cgo_flags.go             |  1 +
 cl/cl.go                 |  2 +-
 cl/context.go            |  2 +-
 cl/image.go              | 18 ++++-------
 cl/kernel.go             | 10 ++----
 cl/kernel1x.go           |  3 +-
 cl/memory.go             |  3 +-
 cl/program.go            |  2 +-
 cl/program11.go          |  6 ++--
 cladldevice.go           |  1 +
 cldevice.go              | 30 +++--------------
 config.go                | 21 +++---------
 cudakernel_static.go     |  3 +-
 cudakernel_windows.go    |  1 +
 cudevice.go              |  1 +
 device.go                |  3 +-
 getwork.go               | 21 ------------
 go.mod                   | 31 +++++++++++++-----
 go.sum                   | 69 +++++++++++++++++++++-------------------
 goclean.sh               | 15 ---------
 stratum/stratum.go       |  2 +-
 25 files changed, 134 insertions(+), 161 deletions(-)
 create mode 100644 .golangci.yml
 delete mode 100755 goclean.sh

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 3469d00..87ae3b6 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -1,31 +1,34 @@
 name: Build and Test
 on: [push, pull_request]
+permissions:
+  contents: read
+
 jobs:
   build:
     name: Go CI
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     strategy:
       matrix:
-        go: [1.15, 1.16]
+        go: ['1.19', '1.20']
     steps:
       - name: Set up Go
-        uses: actions/setup-go@v2
+        uses: actions/setup-go@fac708d6674e30b6ba41289acaab6d4b75aa0753 #v4.0.1
         with:
           go-version: ${{ matrix.go }}
       - name: Check out source
-        uses: actions/checkout@v2
+        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 #v3.5.3
       - name: Install build dependencies
         run: |
           sudo apt-get update
           sudo apt-get install -y beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit ocl-icd-opencl-dev opencl-headers nvidia-opencl-dev
       - name: Install Linters
-        run: "curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(go env GOPATH)/bin v1.39.0"
+        run: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.53.3
       - name: Build OpenCL
         env:
-          GO111MODULE: "on"
+          CL_TARGET_OPENCL_VERSION: "220"
+
         run: go build -tags opencl ./...
+      - name: Lint
+        run: golangci-lint -c ./.golangci.yml run
       - name: Test
-        env:
-          GO111MODULE: "on"
-        run: |
-          sh ./goclean.sh
+        run: go test -tags opencl -v ./...
diff --git a/.golangci.yml b/.golangci.yml
new file mode 100644
index 0000000..10b101b
--- /dev/null
+++ b/.golangci.yml
@@ -0,0 +1,25 @@
+run:
+  deadline: 10m
+  build-tags:
+    - opencl
+
+linters:
+  disable-all: true
+  enable:
+    - asciicheck
+    - bidichk
+    - bodyclose
+    - durationcheck
+    - exportloopref
+    - gofmt
+    - goimports
+    - gosimple
+    - grouper
+    - ineffassign
+    - misspell
+    - nosprintfhostport
+    - reassign
+    - rowserrcheck
+    - tparallel
+    - unconvert
+    - unused
diff --git a/adl/adl.go b/adl/adl.go
index 26a24b1..936ce00 100644
--- a/adl/adl.go
+++ b/adl/adl.go
@@ -1,3 +1,4 @@
+//go:build opencladl && !cuda && !opencl
 // +build opencladl,!cuda,!opencl
 
 package adl
diff --git a/calibrate.go b/calibrate.go
index c3a1d50..7e61c15 100644
--- a/calibrate.go
+++ b/calibrate.go
@@ -1,5 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
+//go:build !cuda
 // +build !cuda
 
 package main
diff --git a/cgo_flags.go b/cgo_flags.go
index dc226d2..d64e25e 100644
--- a/cgo_flags.go
+++ b/cgo_flags.go
@@ -1,5 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
+//go:build cuda && !opencl
 // +build cuda,!opencl
 
 package main
diff --git a/cl/cl.go b/cl/cl.go
index 2cb0e28..cc354db 100644
--- a/cl/cl.go
+++ b/cl/cl.go
@@ -147,7 +147,7 @@ type CL_channel_order CL_uint
 type CL_channel_type CL_uint
 type CL_mem_flags CL_bitfield
 
-//type CL_svm_mem_flags CL_bitfield
+// type CL_svm_mem_flags CL_bitfield
 type CL_mem_object_type CL_uint
 type CL_mem_info CL_uint
 type CL_mem_migration_flags CL_bitfield
diff --git a/cl/context.go b/cl/context.go
index c67d516..d618102 100644
--- a/cl/context.go
+++ b/cl/context.go
@@ -92,7 +92,7 @@ func CLCreateContext(properties []CL_context_properties,
 		if devices != nil {
 			c_devices = make([]C.cl_device_id, len(devices))
 			for i := 0; i < len(devices); i++ {
-				c_devices[i] = C.cl_device_id(devices[i].cl_device_id)
+				c_devices[i] = devices[i].cl_device_id
 			}
 			c_devices_ptr = &c_devices[0]
 		} else {
diff --git a/cl/image.go b/cl/image.go
index a65b66f..65d292f 100644
--- a/cl/image.go
+++ b/cl/image.go
@@ -88,8 +88,7 @@ func CLEnqueueMapImage(command_queue CL_command_queue,
 	}
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -163,8 +162,7 @@ func CLEnqueueCopyImageToBuffer(command_queue CL_command_queue,
 		c_region[i] = C.size_t(region[i])
 	}
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -222,8 +220,7 @@ func CLEnqueueCopyBufferToImage(command_queue CL_command_queue,
 		c_region[i] = C.size_t(region[i])
 	}
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -283,8 +280,7 @@ func CLEnqueueReadImage(command_queue CL_command_queue,
 		c_region[i] = C.size_t(region[i])
 	}
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -348,8 +344,7 @@ func CLEnqueueWriteImage(command_queue CL_command_queue,
 		c_region[i] = C.size_t(region[i])
 	}
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
@@ -412,8 +407,7 @@ func CLEnqueueCopyImage(command_queue CL_command_queue,
 		c_region[i] = C.size_t(region[i])
 	}
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
diff --git a/cl/kernel.go b/cl/kernel.go
index 08719b5..32c28e5 100644
--- a/cl/kernel.go
+++ b/cl/kernel.go
@@ -21,14 +21,11 @@ func CLCreateKernel(program CL_program,
 	}
 
 	var c_errcode_ret C.cl_int
-	var c_kernel C.cl_kernel
 
-	var c_kernel_name *C.char
-
-	c_kernel_name = C.CString(string(kernel_name))
+	c_kernel_name := C.CString(string(kernel_name))
 	defer C.free(unsafe.Pointer(c_kernel_name))
 
-	c_kernel = C.clCreateKernel(program.cl_program,
+	c_kernel := C.clCreateKernel(program.cl_program,
 		c_kernel_name, &c_errcode_ret)
 
 	if errcode_ret != nil {
@@ -300,8 +297,7 @@ func CLEnqueueNDRangeKernel(command_queue CL_command_queue,
 	}
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
diff --git a/cl/kernel1x.go b/cl/kernel1x.go
index 0da8d20..9d1380b 100644
--- a/cl/kernel1x.go
+++ b/cl/kernel1x.go
@@ -21,8 +21,7 @@ func CLEnqueueTask(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
diff --git a/cl/memory.go b/cl/memory.go
index 5a1c6cc..83e1240 100644
--- a/cl/memory.go
+++ b/cl/memory.go
@@ -73,8 +73,7 @@ func CLEnqueueUnmapMemObject(command_queue CL_command_queue,
 	var c_errcode_ret C.cl_int
 
 	if num_events_in_wait_list != 0 {
-		var c_event_wait_list []C.cl_event
-		c_event_wait_list = make([]C.cl_event, num_events_in_wait_list)
+		c_event_wait_list := make([]C.cl_event, num_events_in_wait_list)
 		for i := 0; i < int(num_events_in_wait_list); i++ {
 			c_event_wait_list[i] = event_wait_list[i].cl_event
 		}
diff --git a/cl/program.go b/cl/program.go
index ee68491..80c13a9 100644
--- a/cl/program.go
+++ b/cl/program.go
@@ -183,7 +183,7 @@ func CLBuildProgram(program CL_program,
 
 	c_devices = make([]C.cl_device_id, len(devices))
 	for i := 0; i < len(devices); i++ {
-		c_devices[i] = C.cl_device_id(devices[i].cl_device_id)
+		c_devices[i] = devices[i].cl_device_id
 	}
 	if options != nil {
 		c_options = C.CString(string(options))
diff --git a/cl/program11.go b/cl/program11.go
index 4463f00..f03eb15 100644
--- a/cl/program11.go
+++ b/cl/program11.go
@@ -5,9 +5,9 @@ package cl
 */
 import "C"
 
-///////////////////////////////////////////////
-//OpenCL 1.1
-///////////////////////////////////////////////
+// /////////////////////////////////////////////
+// OpenCL 1.1
+// /////////////////////////////////////////////
 func CLUnloadCompiler() CL_int {
 	return CL_int(C.clUnloadCompiler())
 }
diff --git a/cladldevice.go b/cladldevice.go
index 238826c..1fc1a86 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -1,5 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
+//go:build opencladl && !cuda && !opencl
 // +build opencladl,!cuda,!opencl
 
 package main
diff --git a/cldevice.go b/cldevice.go
index c60724b..6a28350 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -1,5 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
+//go:build opencl && !cuda && !opencladl
 // +build opencl,!cuda,!opencladl
 
 package main
@@ -182,9 +183,7 @@ func loadProgramSource(filename string) ([][]byte, []cl.CL_size_t, error) {
 
 	programSize[0] = cl.CL_size_t(len(programFinal))
 	programBuffer[0] = make([]byte, programSize[0])
-	for i := range programFinal {
-		programBuffer[0][i] = programFinal[i]
-	}
+	copy(programBuffer[0], programFinal)
 
 	return programBuffer[:], programSize[:], nil
 }
@@ -205,7 +204,6 @@ type Device struct {
 
 	sync.Mutex
 	index int
-	cuda  bool
 
 	// Items for OpenCL device
 	platformID               cl.CL_platform_id
@@ -224,10 +222,6 @@ type Device struct {
 	kind                     string
 	tempTarget               uint32
 
-	//cuInput        cu.DevicePtr
-	cuInSize       int64
-	cuOutputBuffer []float64
-
 	workSize uint32
 
 	// extraNonce is the device extraNonce, where the first
@@ -329,20 +323,6 @@ func deviceStatsWriteSysfsEntry(path string, value uint32) error {
 	return nil
 }
 
-func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
-	var platformID cl.CL_platform_id
-	platformIDs, err := getCLPlatforms()
-	if err != nil {
-		return platformID, nil, fmt.Errorf("Could not get CL platforms: %v", err)
-	}
-	platformID = platformIDs[0]
-	CLdeviceIDs, err := getCLDevices(platformID)
-	if err != nil {
-		return platformID, nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
-	}
-	return platformID, CLdeviceIDs, nil
-}
-
 func getCLPlatforms() ([]cl.CL_platform_id, error) {
 	var numPlatforms cl.CL_uint
 	status := cl.CLGetPlatformIDs(0, nil, &numPlatforms)
@@ -529,12 +509,12 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		}
 		if len(cfg.WorkSizeInts) > 0 {
 			// Apply the first setting as a global setting
-			globalWorkSize = uint32(cfg.WorkSizeInts[0])
+			globalWorkSize = cfg.WorkSizeInts[0]
 
 			// Override with the per-device setting if it exists
 			for i := range cfg.WorkSizeInts {
 				if i == order {
-					globalWorkSize = uint32(cfg.WorkSizeInts[order])
+					globalWorkSize = cfg.WorkSizeInts[order]
 				}
 			}
 
@@ -568,7 +548,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		// Override with the per-device setting if it exists
 		for i := range cfg.TempTargetInts {
 			if i == order {
-				d.tempTarget = uint32(cfg.TempTargetInts[order])
+				d.tempTarget = cfg.TempTargetInts[order]
 			}
 		}
 		d.fanControlActive = true
diff --git a/config.go b/config.go
index 5706957..bbce270 100644
--- a/config.go
+++ b/config.go
@@ -15,7 +15,7 @@ import (
 	"time"
 
 	"github.com/decred/dcrd/chaincfg/v3"
-	"github.com/decred/dcrd/dcrutil/v3"
+	"github.com/decred/dcrd/dcrutil/v4"
 	"github.com/decred/slog"
 	"github.com/jessevdk/go-flags"
 )
@@ -37,7 +37,6 @@ var (
 	defaultRPCPortMainNet = "9109"
 	defaultRPCPortTestNet = "19109"
 	defaultRPCPortSimNet  = "19556"
-	defaultAPIHost        = "localhost"
 	defaultAPIPort        = "3333"
 	defaultLogDir         = filepath.Join(minerHomeDir, defaultLogDirname)
 	defaultAutocalibrate  = 500
@@ -135,16 +134,6 @@ func normalizeAddresses(addrs []string, defaultPort string) []string {
 	return removeDuplicateAddresses(addrs)
 }
 
-// filesExists reports whether the named file or directory exists.
-func fileExists(name string) bool {
-	if _, err := os.Stat(name); err != nil {
-		if os.IsNotExist(err) {
-			return false
-		}
-	}
-	return true
-}
-
 // validLogLevel returns whether or not logLevel is a valid debug log level.
 func validLogLevel(logLevel string) bool {
 	_, ok := slog.LevelFromString(logLevel)
@@ -234,10 +223,10 @@ func cleanAndExpandPath(path string) string {
 // line options.
 //
 // The configuration proceeds as follows:
-// 	1) Start with a default config with sane settings
-// 	2) Pre-parse the command line to check for an alternative config file
-// 	3) Load configuration file overwriting defaults with any specified options
-// 	4) Parse CLI options and overwrite/add any specified options
+//  1. Start with a default config with sane settings
+//  2. Pre-parse the command line to check for an alternative config file
+//  3. Load configuration file overwriting defaults with any specified options
+//  4. Parse CLI options and overwrite/add any specified options
 //
 // The above results in btcd functioning properly without any config settings
 // while still allowing the user to override settings with config files and
diff --git a/cudakernel_static.go b/cudakernel_static.go
index f34eb71..1ee560e 100644
--- a/cudakernel_static.go
+++ b/cudakernel_static.go
@@ -1,6 +1,7 @@
 // Copyright (c) 2016 The Decred developers.
 
-//+build linux,cuda darwin,cuda
+//go:build (linux && cuda) || (darwin && cuda)
+// +build linux,cuda darwin,cuda
 
 package main
 
diff --git a/cudakernel_windows.go b/cudakernel_windows.go
index 36af76b..6ece5f3 100644
--- a/cudakernel_windows.go
+++ b/cudakernel_windows.go
@@ -1,4 +1,5 @@
 // Copyright (c) 2016 The Decred developers.
+//go:build cuda
 // +build cuda
 
 package main
diff --git a/cudevice.go b/cudevice.go
index 031b110..3d2b9c9 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -1,5 +1,6 @@
 // Copyright (c) 2016 The Decred developers.
 
+//go:build cuda && !opencl
 // +build cuda,!opencl
 
 package main
diff --git a/device.go b/device.go
index 371041d..465fe4e 100644
--- a/device.go
+++ b/device.go
@@ -8,7 +8,7 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/decred/dcrd/blockchain/standalone"
+	"github.com/decred/dcrd/blockchain/standalone/v2"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/chaincfg/v3"
 
@@ -18,7 +18,6 @@ import (
 )
 
 var chainParams = chaincfg.MainNetParams()
-var deviceLibraryInitialized = false
 
 // Constants for fan and temperature bits
 const (
diff --git a/getwork.go b/getwork.go
index caa8679..53d81f7 100644
--- a/getwork.go
+++ b/getwork.go
@@ -89,32 +89,11 @@ type getWorkSubmitResponseJson struct {
 	}
 }
 
-var (
-	httpClient *http.Client
-)
-
 const (
 	MaxIdleConnections int = 20
 	RequestTimeout     int = 5
 )
 
-// init HTTPClient
-func init() {
-	httpClient = createHTTPClient()
-}
-
-// createHTTPClient for connection re-use
-func createHTTPClient() *http.Client {
-	client := &http.Client{
-		Transport: &http.Transport{
-			MaxIdleConnsPerHost: MaxIdleConnections,
-		},
-		Timeout: time.Duration(RequestTimeout) * time.Second,
-	}
-
-	return client
-}
-
 // GetWork makes a getwork RPC call and returns the result (data and target)
 func GetWork() (*work.Work, error) {
 	// Generate a request to the configured RPC server.
diff --git a/go.mod b/go.mod
index 7e3c917..1f53236 100644
--- a/go.mod
+++ b/go.mod
@@ -1,17 +1,32 @@
 module github.com/decred/gominer
 
+go 1.18
+
 require (
 	github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8
 	github.com/davecgh/go-spew v1.1.1
-	github.com/decred/dcrd/blockchain/standalone v1.0.0
-	github.com/decred/dcrd/chaincfg/chainhash v1.0.2
-	github.com/decred/dcrd/chaincfg/v3 v3.0.0
-	github.com/decred/dcrd/dcrutil/v3 v3.0.0
-	github.com/decred/dcrd/wire v1.4.0
+	github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0
+	github.com/decred/dcrd/chaincfg/chainhash v1.0.4
+	github.com/decred/dcrd/chaincfg/v3 v3.2.0
+	github.com/decred/dcrd/dcrutil/v4 v4.0.1
+	github.com/decred/dcrd/wire v1.6.0
 	github.com/decred/go-socks v1.1.0
-	github.com/decred/slog v1.0.0
-	github.com/jessevdk/go-flags v1.4.0
+	github.com/decred/slog v1.2.0
+	github.com/jessevdk/go-flags v1.5.0
 	github.com/jrick/logrotate v1.0.0
 )
 
-go 1.12
+require (
+	github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 // indirect
+	github.com/dchest/siphash v1.2.3 // indirect
+	github.com/decred/base58 v1.0.5 // indirect
+	github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect
+	github.com/decred/dcrd/crypto/ripemd160 v1.0.2 // indirect
+	github.com/decred/dcrd/dcrec v1.0.1 // indirect
+	github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3 // indirect
+	github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect
+	github.com/decred/dcrd/txscript/v4 v4.1.0 // indirect
+	github.com/klauspost/cpuid/v2 v2.0.9 // indirect
+	golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4 // indirect
+	lukechampine.com/blake3 v1.2.1 // indirect
+)
diff --git a/go.sum b/go.sum
index 75f082f..967371e 100644
--- a/go.sum
+++ b/go.sum
@@ -2,42 +2,45 @@ github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 h1:w1UutsfOrms1J05zt7I
 github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412/go.mod h1:WPjqKcmVOxf0XSf3YxCJs6N6AOSrOx3obionmG7T0y0=
 github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8 h1:lnbKU7kkMoF75PDPYaj0DLoD0p6lWtzeyXSR94PrQto=
 github.com/barnex/cuda5 v0.0.0-20171012184954-da30a9b287d8/go.mod h1:GnBnFz4V/+kxwKFnquvOOi+IjZoVJsIUbcAVOXLCxCo=
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/dchest/blake256 v1.0.0 h1:6gUgI5MHdz9g0TdrgKqXsoDX+Zjxmm1Sc6OsoGru50I=
-github.com/dchest/blake256 v1.0.0/go.mod h1:xXNWCE1jsAP8DAjP+rKw2MbeqLczjI3TRx2VK+9OEYY=
-github.com/decred/base58 v1.0.3 h1:KGZuh8d1WEMIrK0leQRM47W85KqCAdl2N+uagbctdDI=
-github.com/decred/base58 v1.0.3/go.mod h1:pXP9cXCfM2sFLb2viz2FNIdeMWmZDBKG3ZBYbiSM78E=
-github.com/decred/dcrd/blockchain/standalone v1.0.0 h1:bPkFgSV7/NeZI+ZEGhaOP+XccCUBTIJb3YTf8dMwe8g=
-github.com/decred/dcrd/blockchain/standalone v1.0.0/go.mod h1:U5lOleFSi1nL7heSdLgEtuvg0udS1p3cvHxvLJbihfE=
-github.com/decred/dcrd/chaincfg/chainhash v1.0.1 h1:0vG7U9+dSjSCaHQKdoSKURK2pOb47+b+8FK5q4+Je7M=
-github.com/decred/dcrd/chaincfg/chainhash v1.0.1/go.mod h1:OVfvaOsNLS/A1y4Eod0Ip/Lf8qga7VXCQjUQLbkY0Go=
-github.com/decred/dcrd/chaincfg/chainhash v1.0.2 h1:rt5Vlq/jM3ZawwiacWjPa+smINyLRN07EO0cNBV6DGU=
-github.com/decred/dcrd/chaincfg/chainhash v1.0.2/go.mod h1:BpbrGgrPTr3YJYRN3Bm+D9NuaFd+zGyNeIKgrhCXK60=
-github.com/decred/dcrd/chaincfg/v3 v3.0.0 h1:+TFbu7ZmvBwM+SZz5mrj6cun9ts/6DAL5sqnsaFBHGQ=
-github.com/decred/dcrd/chaincfg/v3 v3.0.0/go.mod h1:EspyubQ7D2w6tjP7rBGDIE7OTbuMgBjR2F2kZFnh31A=
-github.com/decred/dcrd/crypto/blake256 v1.0.0 h1:/8DMNYp9SGi5f0w7uCm6d6M4OU2rGFK09Y2A4Xv7EE0=
-github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc=
-github.com/decred/dcrd/crypto/ripemd160 v1.0.1 h1:TjRL4LfftzTjXzaufov96iDAkbY2R3aTvH2YMYa1IOc=
-github.com/decred/dcrd/crypto/ripemd160 v1.0.1/go.mod h1:F0H8cjIuWTRoixr/LM3REB8obcWkmYx0gbxpQWR8RPg=
-github.com/decred/dcrd/dcrec v1.0.0 h1:W+z6Es+Rai3MXYVoPAxYr5U1DGis0Co33scJ6uH2J6o=
-github.com/decred/dcrd/dcrec v1.0.0/go.mod h1:HIaqbEJQ+PDzQcORxnqen5/V1FR3B4VpIfmePklt8Q8=
-github.com/decred/dcrd/dcrec/edwards/v2 v2.0.1 h1:V6eqU1crZzuoFT4KG2LhaU5xDSdkHuvLQsj25wd7Wb4=
-github.com/decred/dcrd/dcrec/edwards/v2 v2.0.1/go.mod h1:d0H8xGMWbiIQP7gN3v2rByWUcuZPm9YsgmnfoxgbINc=
-github.com/decred/dcrd/dcrec/secp256k1/v3 v3.0.0 h1:sgNeV1VRMDzs6rzyPpxyM0jp317hnwiq58Filgag2xw=
-github.com/decred/dcrd/dcrec/secp256k1/v3 v3.0.0/go.mod h1:J70FGZSbzsjecRTiTzER+3f1KZLNaXkuv+yeFTKoxM8=
-github.com/decred/dcrd/dcrutil/v3 v3.0.0 h1:n6uQaTQynIhCY89XsoDk2WQqcUcnbD+zUM9rnZcIOZo=
-github.com/decred/dcrd/dcrutil/v3 v3.0.0/go.mod h1:iVsjcqVzLmYFGCZLet2H7Nq+7imV9tYcuY+0lC2mNsY=
-github.com/decred/dcrd/wire v1.2.0 h1:HqJVB7vcklIguzFWgRXw/WYCQ9cD3bUC5TKj53i1Hng=
-github.com/decred/dcrd/wire v1.2.0/go.mod h1:/JKOsLInOJu6InN+/zH5AyCq3YDIOW/EqcffvU8fJHM=
-github.com/decred/dcrd/wire v1.4.0 h1:KmSo6eTQIvhXS0fLBQ/l7hG7QLcSJQKSwSyzSqJYDk0=
-github.com/decred/dcrd/wire v1.4.0/go.mod h1:WxC/0K+cCAnBh+SKsRjIX9YPgvrjhmE+6pZlel1G7Ro=
+github.com/dchest/siphash v1.2.3 h1:QXwFc8cFOR2dSa/gE6o/HokBMWtLUaNDVd+22aKHeEA=
+github.com/dchest/siphash v1.2.3/go.mod h1:0NvQU092bT0ipiFN++/rXm69QG9tVxLAlQHIXMPAkHc=
+github.com/decred/base58 v1.0.5 h1:hwcieUM3pfPnE/6p3J100zoRfGkQxBulZHo7GZfOqic=
+github.com/decred/base58 v1.0.5/go.mod h1:s/8lukEHFA6bUQQb/v3rjUySJ2hu+RioCzLukAVkrfw=
+github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0 h1:v3yfo66axjr3oLihct+5tLEeM9YUzvK3i/6e2Im6RO0=
+github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0/go.mod h1:JsOpl2nHhW2D2bWMEtbMuAE+mIU/Pdd1i1pmYR+2RYI=
+github.com/decred/dcrd/chaincfg/chainhash v1.0.4 h1:zRCv6tdncLfLTKYqu7hrXvs7hW+8FO/NvwoFvGsrluU=
+github.com/decred/dcrd/chaincfg/chainhash v1.0.4/go.mod h1:hA86XxlBWwHivMvxzXTSD0ZCG/LoYsFdWnCekkTMCqY=
+github.com/decred/dcrd/chaincfg/v3 v3.2.0 h1:6WxA92AGBkycEuWvxtZMvA76FbzbkDRoK8OGbsR2muk=
+github.com/decred/dcrd/chaincfg/v3 v3.2.0/go.mod h1:2rHW1TKyFmwZTVBLoU/Cmf0oxcpBjUEegbSlBfrsriI=
+github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y=
+github.com/decred/dcrd/crypto/blake256 v1.0.1/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo=
+github.com/decred/dcrd/crypto/ripemd160 v1.0.2 h1:TvGTmUBHDU75OHro9ojPLK+Yv7gDl2hnUvRocRCjsys=
+github.com/decred/dcrd/crypto/ripemd160 v1.0.2/go.mod h1:uGfjDyePSpa75cSQLzNdVmWlbQMBuiJkvXw/MNKRY4M=
+github.com/decred/dcrd/dcrec v1.0.1 h1:gDzlndw0zYxM5BlaV17d7ZJV6vhRe9njPBFeg4Db2UY=
+github.com/decred/dcrd/dcrec v1.0.1/go.mod h1:CO+EJd8eHFb8WHa84C7ZBkXsNUIywaTHb+UAuI5uo6o=
+github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3 h1:l/lhv2aJCUignzls81+wvga0TFlyoZx8QxRMQgXpZik=
+github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3/go.mod h1:AKpV6+wZ2MfPRJnTbQ6NPgWrKzbe9RCIlCF/FKzMtM8=
+github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs=
+github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0=
+github.com/decred/dcrd/dcrutil/v4 v4.0.1 h1:E+d2TNbpOj0f1L9RqkZkEm1QolFjajvkzxWC5WOPf1s=
+github.com/decred/dcrd/dcrutil/v4 v4.0.1/go.mod h1:7EXyHYj8FEqY+WzMuRkF0nh32ueLqhutZDoW4eQ+KRc=
+github.com/decred/dcrd/txscript/v4 v4.1.0 h1:uEdcibIOl6BuWj3AqmXZ9xIK/qbo6lHY9aNk29FtkrU=
+github.com/decred/dcrd/txscript/v4 v4.1.0/go.mod h1:OVguPtPc4YMkgssxzP8B6XEMf/J3MB6S1JKpxgGQqi0=
+github.com/decred/dcrd/wire v1.6.0 h1:YOGwPHk4nzGr6OIwUGb8crJYWDiVLpuMxfDBCCF7s/o=
+github.com/decred/dcrd/wire v1.6.0/go.mod h1:XQ8Xv/pN/3xaDcb7sH8FBLS9cdgVctT7HpBKKGsIACk=
 github.com/decred/go-socks v1.1.0 h1:dnENcc0KIqQo3HSXdgboXAHgqsCIutkqq6ntQjYtm2U=
 github.com/decred/go-socks v1.1.0/go.mod h1:sDhHqkZH0X4JjSa02oYOGhcGHYp12FsY1jQ/meV8md0=
-github.com/decred/slog v1.0.0 h1:Dl+W8O6/JH6n2xIFN2p3DNjCmjYwvrXsjlSJTQQ4MhE=
-github.com/decred/slog v1.0.0/go.mod h1:zR98rEZHSnbZ4WHZtO0iqmSZjDLKhkXfrPTZQKtAonQ=
-github.com/jessevdk/go-flags v1.4.0 h1:4IU2WS7AumrZ/40jfhf4QVDMsQwqA7VEHozFRrGARJA=
-github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
+github.com/decred/slog v1.2.0 h1:soHAxV52B54Di3WtKLfPum9OFfWqwtf/ygf9njdfnPM=
+github.com/decred/slog v1.2.0/go.mod h1:kVXlGnt6DHy2fV5OjSeuvCJ0OmlmTF6LFpEPMu/fOY0=
+github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc=
+github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4=
 github.com/jrick/logrotate v1.0.0 h1:lQ1bL/n9mBNeIXoTUoYRlK4dHuNJVofX9oWqBtPnSzI=
 github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
+github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
+github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4 h1:EZ2mChiOa8udjfp6rRmswTbtZN/QzUQp4ptM4rnjHvc=
+golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+lukechampine.com/blake3 v1.2.1 h1:YuqqRuaqsGV71BV/nm9xlI0MKUv4QC54jQnBChWbGnI=
+lukechampine.com/blake3 v1.2.1/go.mod h1:0OFRp7fBtAylGVCO40o87sbupkyIGgbpv1+M1k1LM6k=
diff --git a/goclean.sh b/goclean.sh
deleted file mode 100755
index e0a195e..0000000
--- a/goclean.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-# The script does automatic checking on a Go package and its sub-packages, including:
-# 1. gofmt         (http://golang.org/cmd/gofmt/)
-# 2. go vet        (http://golang.org/cmd/vet)
-# 4. ineffassign   (https://github.com/gordonklaus/ineffassign)
-
-set -ex
-
-# golangci-lint (github.com/golangci/golangci-lint) is used to run each each
-# static checker.
-
-# check linters
-golangci-lint run --build-tags opencl --disable-all --deadline=10m \
-  --enable=gofmt \
-  --enable=ineffassign
diff --git a/stratum/stratum.go b/stratum/stratum.go
index b7d8025..9c80ea3 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -70,7 +70,7 @@ type Config struct {
 	Version   string
 }
 
-// NotifyWork holds all the info recieved from a mining.notify message along
+// NotifyWork holds all the info received from a mining.notify message along
 // with the Work data generate from it.
 type NotifyWork struct {
 	Clean             bool

From 0327aea870784d189ff89b0bcdc242845082f5a6 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 09:19:15 -0500
Subject: [PATCH 103/150] Add GPU support for BLAKE3 via OpenCL.

This adds all of the logic necessary to allow gominer to support
Decred's change to BLAKE3 via DCP0011.  It includes a custom optimized
BLAKE3 OpenCL kernel based on midstates for supporting GPUs that work
with OpenCL and OpenCL with ADL.

It also removes the no longer relevant code related to blake256 and
updates the README.

Finally, the following changes have also been made:

- The miner will now work with the remaining properly discovered CL
  devices when some of the other platforms in the system have no devices
- Additional nonces are now randomized such that each device in the same
  system (up to 65536) is doing different work while also helping
  prevent collisions across multiple processes and systems working on
  the same template
- A bunch of unnecessary endian swaps are no longer done
- All hashrates across the entire spectrum will now display properly and
  consistently
---
 README.md                 |   48 +-
 blake256-old.cl           | 1747 -------------------------------------
 blake256.cl               |  161 ----
 blake256/blake256block.go | 1684 -----------------------------------
 blake3.cl                 |  159 ++++
 blake3/block.go           |  208 +++++
 blake3/convert_amd64.go   |   21 +
 blake3/convert_generic.go |   33 +
 calibrate.go              |    4 +-
 cl/cl.h                   |    1 +
 cladldevice.go            |   27 +-
 cldevice.go               |   27 +-
 config.go                 |    4 +-
 cudevice.go               |   24 +-
 device.go                 |  102 ++-
 miner.go                  |   10 +-
 sample-gominer.conf       |    2 +-
 stratum/stratum.go        |    4 +-
 util/util.go              |   21 +-
 work/work.go              |    1 +
 20 files changed, 579 insertions(+), 3709 deletions(-)
 delete mode 100644 blake256-old.cl
 delete mode 100644 blake256.cl
 delete mode 100644 blake256/blake256block.go
 create mode 100644 blake3.cl
 create mode 100644 blake3/block.go
 create mode 100644 blake3/convert_amd64.go
 create mode 100644 blake3/convert_generic.go

diff --git a/README.md b/README.md
index c472fd6..66c00a8 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # gominer
 
 gominer is an application for performing Proof-of-Work (PoW) mining on the
-Decred network.  It supports solo and stratum/pool mining using CUDA and
-OpenCL devices.
+Decred network after the activation of DCP0011 using BLAKE3.  It supports solo
+and stratum/pool mining using OpenCL devices.
 
 ## Downloading
 
@@ -78,18 +78,21 @@ $ curl http://localhost:3333/
 
 #### Pre-Requisites
 
+NOTE: The CUDA support has NOT been updated yet for BLAKE3.  Matheus is working
+on adding support, so this section hasn't been modified, but it is out of date.
+
 You will either need to install CUDA for NVIDIA graphics cards or OpenCL
 library/headers that support your device such as: AMDGPU-PRO (for newer AMD
 cards), Beignet (for Intel Graphics), or Catalyst (for older AMD cards).
 
-For example, on Ubuntu 16.04 you can install the necessary OpenCL packages (for
+For example, on Ubuntu 23.04 you can install the necessary OpenCL packages (for
 Intel Graphics) and CUDA libraries with:
 
 ```
 sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
-gominer has been built successfully on Ubuntu 16.04 with go1.11,
+gominer has been built successfully on Ubuntu 23.04 with go1.21.0,
 g++ 5.4.0, and beignet-dev 1.1.1-2 although other combinations should work as
 well.
 
@@ -98,11 +101,8 @@ well.
 To download and build gominer, run:
 
 ```
-go get github.com/decred/gominer
-cd $GOPATH/src/github.com/decred/gominer
+git clone https://github.com/decred/gominer
 cd gominer
-
-env GO111MODULE=on go build
 ```
 
 For CUDA with NVIDIA Management Library (NVML) support:
@@ -129,18 +129,7 @@ go build -tags opencladl
   * Make sure to select the Git-Bash option when prompted
 - Download the MinGW-w64 installer from [https://sourceforge.net/projects/mingw-w64/files/Toolchains targetting Win32/Personal Builds/mingw-builds/installer/](https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/)
   * Select the x64 toolchain and use defaults for the other questions
-- Set the environment variable GOPATH to `C:\Users\username\go`
-- Check that the GOROOT environment variable is set to C:\Go
-  * This should have been done by the Go installer
-- Add the following locations to your PATH: `C:\Users\username\go\bin;C:\Go\bin`
-- Add `C:\Program Files\mingw-w64\x84_64-6.2.0-posix-seh-rt_v5-rev1\mingw64\bin` to your PATH (This is the latest release as of 2016-09-29)
-- `go get github.com/decred/gominer`
-  * Compilation will most likely fail which can be safely ignored for now.
-- Change to the gominer directory
-  * If using the Windows Command Prompt:
-  ```cd %GOPATH%/src/github.com/decred/gominer```
-  * If using git-bash
-  ```cd $GOPATH/src/github.com/decred/gominer```
+- `git clone https://github.com/decred/gominer`
 
 #### Build Instructions
 
@@ -165,21 +154,22 @@ go build -tags opencladl
 
 ###### Pre-Requisites
 
-- Download AMD APP SDK v3.0 from [http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/)
-  * Samples may be unselected from the install to save space as only the libraries and headers are needed
-- Copy or Move `C:\Program Files (x86)\AMD APP SDK\3.0` to `C:\appsdk`
+- Download OpenCL SDK from [https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases/tag/1.0](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases/tag/1.0)
+- Unzip or untar the downloaded `lightOCLSDK` archive to `C:\appsdk`
   * Ensure the folders `C:\appsdk\include` and `C:\appsdk\lib` are populated
 - Change to the library directory C:\appsdk\lib\x86_64
-  * ```cd C:\appsdk\lib\x86_64```
+  * `cd /D C:\appsdk\lib\x86_64`
 - Copy and prepare the ADL library for linking
-  * ```copy c:\Windows\SysWOW64\atiadlxx.dll .```
-  * ```gendef atiadlxx.dll```
-  * ```dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def```
+  * `copy c:\Windows\SysWOW64\atiadlxx.dll .`
+  * `gendef atiadlxx.dll`
+  * `dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def`
 
 ###### Steps
 
 - For OpenCL:
-  * ```go build -tags opencl```
+  * `cd gominer`
+  * `go build -tags opencl`
 
 - For OpenCL with AMD Device Library (ADL) support:
-  * ```go build -tags opencladl```
+  * `cd gominer`
+  * `go build -tags opencladl`
diff --git a/blake256-old.cl b/blake256-old.cl
deleted file mode 100644
index ed65d90..0000000
--- a/blake256-old.cl
+++ /dev/null
@@ -1,1747 +0,0 @@
-/*    /\\ //\            BLAKE256 14-round kernel            /\\ //\    */
-/*    \// \\/          Copyright 2015  Company Zero          \// \\/    */
-/*    /\\ //\           A complete kernel re-write           /\\ //\    */
-/*    \// \\/           with inspiration  from the           \// \\/    */
-/*    /\\ //\          Golang BLAKE256 repo over at          /\\ //\    */
-/*    \// \\/           github.com/dchest/blake256           \// \\/    */
-
-#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n)))
-
-__constant uint cst0 = 0x243F6A88UL;
-__constant uint cst1 = 0x85A308D3UL;
-__constant uint cst2 = 0x13198A2EUL;
-__constant uint cst3 = 0x03707344UL;
-__constant uint cst4 = 0xA4093822UL;
-__constant uint cst5 = 0x299F31D0UL;
-__constant uint cst6 = 0x082EFA98UL;
-__constant uint cst7 = 0xEC4E6C89UL;
-__constant uint cst8 = 0x452821E6UL;
-__constant uint cst9 = 0x38D01377UL;
-__constant uint cstA = 0xBE5466CFUL;
-__constant uint cstB = 0x34E90C6CUL;
-__constant uint cstC = 0xC0AC29B7UL;
-__constant uint cstD = 0xC97C50DDUL;
-__constant uint cstE = 0x3F84D5B5UL;
-__constant uint cstF = 0xB5470917UL;
-
-__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search(
-	volatile __global uint * restrict output,
-	// Midstate
-	const uint h0,
-	const uint h1,
-	const uint h2,
-	const uint h3,
-	const uint h4,
-	const uint h5,
-	const uint h6,
-	const uint h7,
-
-	// last 52 bytes of original message
-	const uint in32,               // M[0]
-	const uint in33,               // M[1]
-	const uint in34,               // M[2]
-	// const uint in35, = nonce       M[3]
-
-	const uint in36,               // M[4]
-	const uint in37,               // M[5]
-	const uint in38,               // M[6]
-	const uint in39,               // M[7]
-
-	const uint in40,               // M[8]
-	const uint in41,               // M[9]
-	const uint in42,               // M[10]
-	const uint in43,               // M[11]
-
-	const uint in44                // M[12]
-	// in45 = padding                 M[13]
-	// in46 = padding                 M[14]
-	// in47 = padding                 M[15]
-)
-{
-	uint M0, M1, M2, M3, M4, M5, M6, M7;
-	uint M8, M9, MA, MB, MC, MD, ME, MF;
-	uint V0, V1, V2, V3, V4, V5, V6, V7;
-	uint V8, V9, VA, VB, VC, VD, VE, VF;
-	uint pre7;
-
-	/* Load the midstate and initialize */
-	V0 = h0;
-	V1 = h1;
-	V2 = h2;
-	V3 = h3;
-	V4 = h4;
-	V5 = h5;
-	V6 = h6;
-	pre7 = V7 = h7;
-	V8 = cst0;
-	V9 = cst1;
-	VA = cst2;
-	VB = cst3;
-	VC = 0xA4093D82UL;
-	VD = 0x299F3470UL;
-	VE = cst6;
-	VF = cst7;
-
-	uint nonce = get_global_id(0);
-
-	/* Load the block header and padding */
-	M0 = in32;
-	M1 = in33;
-	M2 = in34;
-	M3 = nonce;
-	M4 = in36;
-	M5 = in37;
-	M6 = in38;
-	M7 = in39;
-	M8 = in40;
-	M9 = in41;
-	MA = in42;
-	MB = in43;
-	MC = in44;
-	MD = 0x80000001UL;
-	ME = 0x00000000UL;
-	MF = 0x000005a0UL;
-
-	/* Begin the doing the 64-byte block.
-	 * This can probably be optimized to
-	 * get another 10-15% performance out.
-	*/
-
-	/* Round 1. */
-	V0 = V0 + (M0 ^ cst1);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M2 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M4 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M6 ^ cst7);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M5 ^ cst4);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M7 ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M3 ^ cst2);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M1 ^ cst0);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M8 ^ cst9);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MA ^ cstB);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (ME ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MF ^ cstE);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MB ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M9 ^ cst8);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 2. */
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M4 ^ cst8);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M9 ^ cstF);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MD ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MF ^ cst9);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M6 ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M8 ^ cst4);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M1 ^ cstC);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M0 ^ cst2);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MB ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M5 ^ cst3);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M7 ^ cstB);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M3 ^ cst5);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M2 ^ cst0);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MC ^ cst1);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 3. */
-	V0 = V0 + (MB ^ cst8);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (MC ^ cst0);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M5 ^ cst2);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MF ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M2 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MD ^ cstF);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M0 ^ cstC);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M8 ^ cstB);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M3 ^ cst6);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M7 ^ cst1);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M9 ^ cst4);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M1 ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M4 ^ cst9);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M6 ^ cst3);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 4. */
-	V0 = V0 + (M7 ^ cst9);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M3 ^ cst1);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MB ^ cstE);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (ME ^ cstB);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M1 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M9 ^ cst7);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M2 ^ cst6);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M5 ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M4 ^ cst0);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MF ^ cst8);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M0 ^ cst4);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M8 ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MA ^ cst5);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M6 ^ cst2);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 5. */
-	V0 = V0 + (M9 ^ cst0);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M5 ^ cst7);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M2 ^ cst4);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MA ^ cstF);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M4 ^ cst2);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MF ^ cstA);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M7 ^ cst5);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M0 ^ cst9);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (ME ^ cst1);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MB ^ cstC);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M6 ^ cst8);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M3 ^ cstD);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M8 ^ cst6);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MD ^ cst3);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MC ^ cstB);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M1 ^ cstE);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 6. */
-	V0 = V0 + (M2 ^ cstC);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M6 ^ cstA);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M0 ^ cstB);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M8 ^ cst3);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MB ^ cst0);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M3 ^ cst8);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (MA ^ cst6);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MC ^ cst2);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M4 ^ cstD);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M7 ^ cst5);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MF ^ cstE);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M1 ^ cst9);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (ME ^ cstF);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M9 ^ cst1);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M5 ^ cst7);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MD ^ cst4);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 7. */
-	V0 = V0 + (MC ^ cst5);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M1 ^ cstF);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (ME ^ cstD);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M4 ^ cstA);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MD ^ cstE);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MA ^ cst4);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (MF ^ cst1);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M5 ^ cstC);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M0 ^ cst7);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M6 ^ cst3);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M9 ^ cst2);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M8 ^ cstB);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M2 ^ cst9);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MB ^ cst8);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M3 ^ cst6);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M7 ^ cst0);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 8. */
-	V0 = V0 + (MD ^ cstB);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M7 ^ cstE);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MC ^ cst1);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M3 ^ cst9);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M1 ^ cstC);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M9 ^ cst3);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (ME ^ cst7);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MB ^ cstD);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M5 ^ cst0);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MF ^ cst4);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M8 ^ cst6);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M2 ^ cstA);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M6 ^ cst8);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MA ^ cst2);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M4 ^ cstF);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M0 ^ cst5);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 9. */
-	V0 = V0 + (M6 ^ cstF);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (ME ^ cst9);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MB ^ cst3);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M0 ^ cst8);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M3 ^ cstB);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M8 ^ cst0);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M9 ^ cstE);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MF ^ cst6);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MC ^ cst2);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MD ^ cst7);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M1 ^ cst4);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MA ^ cst5);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M4 ^ cst1);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M5 ^ cstA);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M7 ^ cstD);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M2 ^ cstC);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 10. */
-	V0 = V0 + (MA ^ cst2);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M8 ^ cst4);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M7 ^ cst6);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M1 ^ cst5);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M6 ^ cst7);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M5 ^ cst1);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M4 ^ cst8);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M2 ^ cstA);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MF ^ cstB);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M9 ^ cstE);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M3 ^ cstC);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MD ^ cst0);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (MC ^ cst3);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M0 ^ cstD);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (ME ^ cst9);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MB ^ cstF);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 11. */
-	V0 = V0 + (M0 ^ cst1);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M2 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M4 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (M6 ^ cst7);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M5 ^ cst4);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M7 ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M3 ^ cst2);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M1 ^ cst0);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M8 ^ cst9);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (MA ^ cstB);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (ME ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (MF ^ cstE);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MB ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M9 ^ cst8);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 12. */
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M4 ^ cst8);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M9 ^ cstF);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MD ^ cst6);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MF ^ cst9);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (M6 ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M8 ^ cst4);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M1 ^ cstC);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M0 ^ cst2);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (MB ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M5 ^ cst3);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M7 ^ cstB);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M3 ^ cst5);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M2 ^ cst0);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (MC ^ cst1);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 13. */
-	V0 = V0 + (MB ^ cst8);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (MC ^ cst0);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (M5 ^ cst2);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MF ^ cstD);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (M2 ^ cst5);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (MD ^ cstF);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M0 ^ cstC);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M8 ^ cstB);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (MA ^ cstE);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M3 ^ cst6);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M7 ^ cst1);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (M9 ^ cst4);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M1 ^ cst7);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M4 ^ cst9);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (M6 ^ cst3);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (ME ^ cstA);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* Round 14. */
-	V0 = V0 + (M7 ^ cst9);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 16);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 12);
-	V1 = V1 + (M3 ^ cst1);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 16);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 12);
-	V2 = V2 + (MD ^ cstC);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 16);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 12);
-	V3 = V3 + (MB ^ cstE);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 16);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 12);
-	V2 = V2 + (MC ^ cstD);
-	V2 = V2 + V6;
-	VE = VE ^ V2;
-	VE = SPH_ROTR32(VE, 8);
-	VA = VA + VE;
-	V6 = V6 ^ VA;
-	V6 = SPH_ROTR32(V6, 7);
-	V3 = V3 + (ME ^ cstB);
-	V3 = V3 + V7;
-	VF = VF ^ V3;
-	VF = SPH_ROTR32(VF, 8);
-	VB = VB + VF;
-	V7 = V7 ^ VB;
-	V7 = SPH_ROTR32(V7, 7);
-	V1 = V1 + (M1 ^ cst3);
-	V1 = V1 + V5;
-	VD = VD ^ V1;
-	VD = SPH_ROTR32(VD, 8);
-	V9 = V9 + VD;
-	V5 = V5 ^ V9;
-	V5 = SPH_ROTR32(V5, 7);
-	V0 = V0 + (M9 ^ cst7);
-	V0 = V0 + V4;
-	VC = VC ^ V0;
-	VC = SPH_ROTR32(VC, 8);
-	V8 = V8 + VC;
-	V4 = V4 ^ V8;
-	V4 = SPH_ROTR32(V4, 7);
-	V0 = V0 + (M2 ^ cst6);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 16);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 12);
-	V1 = V1 + (M5 ^ cstA);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 16);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 12);
-	V2 = V2 + (M4 ^ cst0);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 16);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 12);
-	V3 = V3 + (MF ^ cst8);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 16);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 12);
-	V2 = V2 + (M0 ^ cst4);
-	V2 = V2 + V7;
-	VD = VD ^ V2;
-	VD = SPH_ROTR32(VD, 8);
-	V8 = V8 + VD;
-	V7 = V7 ^ V8;
-	V7 = SPH_ROTR32(V7, 7);
-	V3 = V3 + (M8 ^ cstF);
-	V3 = V3 + V4;
-	VE = VE ^ V3;
-	VE = SPH_ROTR32(VE, 8);
-	V9 = V9 + VE;
-	V4 = V4 ^ V9;
-	V4 = SPH_ROTR32(V4, 7);
-	V1 = V1 + (MA ^ cst5);
-	V1 = V1 + V6;
-	VC = VC ^ V1;
-	VC = SPH_ROTR32(VC, 8);
-	VB = VB + VC;
-	V6 = V6 ^ VB;
-	V6 = SPH_ROTR32(V6, 7);
-	V0 = V0 + (M6 ^ cst2);
-	V0 = V0 + V5;
-	VF = VF ^ V0;
-	VF = SPH_ROTR32(VF, 8);
-	VA = VA + VF;
-	V5 = V5 ^ VA;
-	V5 = SPH_ROTR32(V5, 7);
-
-	/* The final chunks of the hash
-	 * are calculated as:
-	 * h0 = h0 ^ V0 ^ V8;
-	 * h1 = h1 ^ V1 ^ V9;
-	 * h2 = h2 ^ V2 ^ VA;
-	 * h3 = h3 ^ V3 ^ VB;
-	 * h4 = h4 ^ V4 ^ VC;
-	 * h5 = h5 ^ V5 ^ VD;
-	 * h6 = h6 ^ V6 ^ VE;
-	 * h7 = h7 ^ V7 ^ VF;
-	 *
-	 * We just check if the last byte
-	 * is zeroed and if it is, we tell
-	 * cgminer that we've found a
-	 * and to check it against the
-	 * target.
-	*/
-
-	/* Debug code to help you assess the correctness
-	 * of your hashing function in case someone decides
-	 * to try to optimize.
-	if (!((pre7 ^ V7 ^ VF) & 0xFFFF0000)) {
-		printf("hash on gpu %x %x %x %x %x %x %x %x\n",
-			h0 ^ V0 ^ V8,
-			h1 ^ V1 ^ V9,
-			h2 ^ V2 ^ VA,
-			h3 ^ V3 ^ VB,
-			h4 ^ V4 ^ VC,
-			h5 ^ V5 ^ VD,
-			h6 ^ V6 ^ VE,
-			h7 ^ V7 ^ VF);
-		printf("nonce for hash on gpu %x\n",
-			nonce);
-	}
-	*/
-
-	if (pre7 ^ V7 ^ VF) return;
-
-	/* Push this share */
-	output[++output[0]] = nonce;
-}
diff --git a/blake256.cl b/blake256.cl
deleted file mode 100644
index 148b1e0..0000000
--- a/blake256.cl
+++ /dev/null
@@ -1,161 +0,0 @@
-/**
- * BLAKE256 14-round kernel
- *
- * Copyright 2015 Company Zero
- * A complete kernel re-write
- * with inspiration from the Golang BLAKE256 repo (github.com/dchest/blake256)
- */
-
-/**
- * optimized by tpruvot 02/2016 :
- *
- * GTX 960 | (5s):735.3M (avg):789.3Mh/s
- * GTX 750 | (5s):443.3M (avg):476.8Mh/s
- * to
- * GTX 960 | (5s):875.0M (avg):899.2Mh/s
- * GTX 750 | (5s):523.1M (avg):536.8Mh/s
- */
-#define ROTR(v,n) rotate(v,(uint)(32U-n))
-#define ROTL(v,n) rotate(v, n)
-
-#ifdef _AMD_OPENCL
-#define SWAP(v)   rotate(v, 16U)
-#define ROTR8(v)  rotate(v, 24U)
-#else
-#define SWAP(v)  as_uint(as_uchar4(v).zwxy)
-#define ROTR8(v) as_uint(as_uchar4(v).yzwx)
-#endif
-
-__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search(
-	volatile __global uint * restrict output,
-	// Midstate
-	const uint h0,
-	const uint h1,
-	const uint h2,
-	const uint h3,
-	const uint h4,
-	const uint h5,
-	const uint h6,
-	const uint h7,
-
-	// last 52 bytes of data
-	const uint M0,
-	const uint M1,
-	const uint M2,
-	// const uint M3 : nonce
-	const uint M4,
-	const uint M5,
-	const uint M6,
-	const uint M7,
-	const uint M8,
-	const uint M9,
-	const uint MA,
-	const uint MB,
-	const uint MC
-)
-{	
-	// Load the block header and padding.
-	const uint M3 = get_global_id(0);
-	const uint MD = 0x80000001UL;
-	const uint ME = 0x00000000UL;
-	const uint MF = 0x000005a0UL;
-
-	const uint cst0 = 0x243F6A88UL;
-	const uint cst1 = 0x85A308D3UL;
-	const uint cst2 = 0x13198A2EUL;
-	const uint cst3 = 0x03707344UL;
-	const uint cst4 = 0xA4093822UL;
-	const uint cst5 = 0x299F31D0UL;
-	const uint cst6 = 0x082EFA98UL;
-	const uint cst7 = 0xEC4E6C89UL;
-	const uint cst8 = 0x452821E6UL;
-	const uint cst9 = 0x38D01377UL;
-	const uint cstA = 0xBE5466CFUL;
-	const uint cstB = 0x34E90C6CUL;
-	const uint cstC = 0xC0AC29B7UL;
-	const uint cstD = 0xC97C50DDUL;
-	const uint cstE = 0x3F84D5B5UL;
-	const uint cstF = 0xB5470917UL;
-
-	uint V0, V1, V2, V3, V4, V5, V6, V7;
-	uint V8, V9, VA, VB, VC, VD, VE, VF;
-	uint pre7;
-
-	// Load the midstate and initialize.
-	V0 = h0;
-	V1 = h1;
-	V2 = h2;
-	V3 = h3;
-	V4 = h4;
-	V5 = h5;
-	V6 = h6;
-	pre7 = V7 = h7;
-
-	V8 = cst0;
-	V9 = cst1;
-	VA = cst2;
-	VB = cst3;
-	VC = 0xA4093D82UL;
-	VD = 0x299F3470UL;
-	VE = cst6;
-	VF = cst7;
-
-	// 14 rounds. 
-	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (M7 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M3 ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MB ^ cstE); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (ME ^ cstB); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M1 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M9 ^ cst7); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M2 ^ cst6); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M5 ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M4 ^ cst0); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MF ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M0 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M8 ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MA ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M6 ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (M9 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M5 ^ cst7); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M2 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MA ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M4 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MF ^ cstA); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M7 ^ cst5); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M0 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (ME ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MB ^ cstC); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M6 ^ cst8); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M3 ^ cstD); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M8 ^ cst6); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MD ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MC ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M1 ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (M2 ^ cstC); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M6 ^ cstA); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M0 ^ cstB); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M8 ^ cst3); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MB ^ cst0); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M3 ^ cst8); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (MA ^ cst6); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MC ^ cst2); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M4 ^ cstD); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M7 ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MF ^ cstE); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M1 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (ME ^ cstF); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M9 ^ cst1); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M5 ^ cst7); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MD ^ cst4); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (MC ^ cst5); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M1 ^ cstF); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (ME ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M4 ^ cstA); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MD ^ cstE); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MA ^ cst4); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (MF ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M5 ^ cstC); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M0 ^ cst7); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M9 ^ cst2); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M8 ^ cstB); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M2 ^ cst9); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MB ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M7 ^ cst0); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (MD ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M7 ^ cstE); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MC ^ cst1); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M3 ^ cst9); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M1 ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M9 ^ cst3); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (ME ^ cst7); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MB ^ cstD); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M5 ^ cst0); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MF ^ cst4); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M8 ^ cst6); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M2 ^ cstA); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M6 ^ cst8); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MA ^ cst2); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M4 ^ cstF); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M0 ^ cst5); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (M6 ^ cstF); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (ME ^ cst9); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MB ^ cst3); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M0 ^ cst8); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M3 ^ cstB); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M8 ^ cst0); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M9 ^ cstE); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MF ^ cst6); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MC ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MD ^ cst7); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M1 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MA ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M4 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M5 ^ cstA); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M7 ^ cstD); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M2 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (MA ^ cst2); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M7 ^ cst6); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M1 ^ cst5); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M6 ^ cst7); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M5 ^ cst1); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M2 ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MF ^ cstB); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M9 ^ cstE); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M3 ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MD ^ cst0); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MC ^ cst3); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M0 ^ cstD); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (ME ^ cst9); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MB ^ cstF); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (M0 ^ cst1); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M2 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M4 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (M6 ^ cst7); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M5 ^ cst4); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M7 ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M3 ^ cst2); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M1 ^ cst0); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M8 ^ cst9); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (MA ^ cstB); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (ME ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (MF ^ cstE); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MB ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M9 ^ cst8); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (ME ^ cstA); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M4 ^ cst8); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M9 ^ cstF); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MD ^ cst6); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MF ^ cst9); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (M6 ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M8 ^ cst4); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M1 ^ cstC); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M0 ^ cst2); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (MB ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M5 ^ cst3); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M7 ^ cstB); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M3 ^ cst5); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M2 ^ cst0); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (MC ^ cst1); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (MB ^ cst8); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (MC ^ cst0); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (M5 ^ cst2); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MF ^ cstD); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (M2 ^ cst5); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (MD ^ cstF); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M0 ^ cstC); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M8 ^ cstB); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (MA ^ cstE); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M3 ^ cst6); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M7 ^ cst1); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (M9 ^ cst4); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M1 ^ cst7); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M4 ^ cst9); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (M6 ^ cst3); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (ME ^ cstA); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);
-	V0 = V0 + (M7 ^ cst9); V0 = V0 + V4; VC = VC ^ V0; VC = SWAP(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 12U); V1 = V1 + (M3 ^ cst1); V1 = V1 + V5; VD = VD ^ V1; VD = SWAP(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 12U); V2 = V2 + (MD ^ cstC); V2 = V2 + V6; VE = VE ^ V2; VE = SWAP(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 12U); V3 = V3 + (MB ^ cstE); V3 = V3 + V7; VF = VF ^ V3; VF = SWAP(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 12U); V2 = V2 + (MC ^ cstD); V2 = V2 + V6; VE = VE ^ V2; VE = ROTR8(VE); VA = VA + VE; V6 = V6 ^ VA; V6 = ROTR(V6, 7U); V3 = V3 + (ME ^ cstB); V3 = V3 + V7; VF = VF ^ V3; VF = ROTR8(VF); VB = VB + VF; V7 = V7 ^ VB; V7 = ROTR(V7, 7U); V1 = V1 + (M1 ^ cst3); V1 = V1 + V5; VD = VD ^ V1; VD = ROTR8(VD); V9 = V9 + VD; V5 = V5 ^ V9; V5 = ROTR(V5, 7U); V0 = V0 + (M9 ^ cst7); V0 = V0 + V4; VC = VC ^ V0; VC = ROTR8(VC); V8 = V8 + VC; V4 = V4 ^ V8; V4 = ROTR(V4, 7U); V0 = V0 + (M2 ^ cst6); V0 = V0 + V5; VF = VF ^ V0; VF = SWAP(VF); VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 12U); V1 = V1 + (M5 ^ cstA); V1 = V1 + V6; VC = VC ^ V1; VC = SWAP(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 12U); V2 = V2 + (M4 ^ cst0); V2 = V2 + V7; VD = VD ^ V2; VD = SWAP(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 12U); V3 = V3 + (MF ^ cst8); V3 = V3 + V4; VE = VE ^ V3; VE = SWAP(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 12U); V2 = V2 + (M0 ^ cst4); V2 = V2 + V7; VD = VD ^ V2; VD = ROTR8(VD); V8 = V8 + VD; V7 = V7 ^ V8; V7 = ROTR(V7, 7U); V3 = V3 + (M8 ^ cstF); V3 = V3 + V4; VE = VE ^ V3; VE = ROTR8(VE); V9 = V9 + VE; V4 = V4 ^ V9; V4 = ROTR(V4, 7U); V1 = V1 + (MA ^ cst5); V1 = V1 + V6; VC = VC ^ V1; VC = ROTR8(VC); VB = VB + VC; V6 = V6 ^ VB; V6 = ROTR(V6, 7U); V0 = V0 + (M6 ^ cst2); V0 = V0 + V5; VF = VF ^ V0; VF = ROTR8(VF);/*VA = VA + VF; V5 = V5 ^ VA; V5 = ROTR(V5, 7U);*/
-
-	/* The final chunks of the hash
-	 * are calculated as:
-	 * h0 = h0 ^ V0 ^ V8;
-	 * h1 = h1 ^ V1 ^ V9;
-	 * h2 = h2 ^ V2 ^ VA;
-	 * h3 = h3 ^ V3 ^ VB;
-	 * h4 = h4 ^ V4 ^ VC;
-	 * h5 = h5 ^ V5 ^ VD;
-	 * h6 = h6 ^ V6 ^ VE;
-	 * h7 = h7 ^ V7 ^ VF;
-	 *
-	 * We just check if the last byte
-	 * is zeroed and if it is, we tell
-	 * cgminer that we've found a
-	 * and to check it against the
-	 * target.
-	*/
-
-	/* Debug code to help you assess the correctness
-	 * of your hashing function in case someone decides
-	 * to try to optimize.
-	if (!((pre7 ^ V7 ^ VF) & 0xFFFF0000)) {
-		printf("hash on gpu %x %x %x %x %x %x %x %x\n",
-			h0 ^ V0 ^ V8,
-			h1 ^ V1 ^ V9,
-			h2 ^ V2 ^ VA,
-			h3 ^ V3 ^ VB,
-			h4 ^ V4 ^ VC,
-			h5 ^ V5 ^ VD,
-			h6 ^ V6 ^ VE,
-			h7 ^ V7 ^ VF);
-		printf("nonce for hash on gpu %x\n",
-			nonce);
-	}
-	*/
-
-        // Push this share.
-	if (pre7 ^ V7 ^ VF) return;
-        
-	// Update nonce.
-	output[++output[0]] = M3;
-}
diff --git a/blake256/blake256block.go b/blake256/blake256block.go
deleted file mode 100644
index d914052..0000000
--- a/blake256/blake256block.go
+++ /dev/null
@@ -1,1684 +0,0 @@
-// Written in 2011-2012 by Dmitry Chestnykh.
-//
-// To the extent possible under law, the author have dedicated all copyright
-// and related and neighboring rights to this software to the public domain
-// worldwide. This software is distributed without any warranty.
-// http://creativecommons.org/publicdomain/zero/1.0/
-
-// BLAKE-256 block step.
-// In its own file so that a faster assembly or C version
-// can be substituted easily.
-
-package blake256
-
-const (
-	cst0  = 0x243F6A88
-	cst1  = 0x85A308D3
-	cst2  = 0x13198A2E
-	cst3  = 0x03707344
-	cst4  = 0xA4093822
-	cst5  = 0x299F31D0
-	cst6  = 0x082EFA98
-	cst7  = 0xEC4E6C89
-	cst8  = 0x452821E6
-	cst9  = 0x38D01377
-	cst10 = 0xBE5466CF
-	cst11 = 0x34E90C6C
-	cst12 = 0xC0AC29B7
-	cst13 = 0xC97C50DD
-	cst14 = 0x3F84D5B5
-	cst15 = 0xB5470917
-)
-
-var IV256 = [8]uint32{
-	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19}
-
-// Block computes a blake256 block, updating the state in 'h' with the data
-// from the block in 'p', assuming a zero salt.
-// h must be 8 uint32's
-// p must be 64 bytes.
-func Block(h []uint32, p []uint8, t uint64) {
-	h0, h1, h2, h3, h4, h5, h6, h7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
-
-	v0, v1, v2, v3, v4, v5, v6, v7 := h0, h1, h2, h3, h4, h5, h6, h7
-
-	v8 := uint32(cst0)
-	v9 := uint32(cst1)
-	v10 := uint32(cst2)
-	v11 := uint32(cst3)
-	v12 := uint32(cst4)
-	v13 := uint32(cst5)
-	v14 := uint32(cst6)
-	v15 := uint32(cst7)
-	v12 ^= uint32(t)
-	v13 ^= uint32(t)
-	v14 ^= uint32(t >> 32)
-	v15 ^= uint32(t >> 32)
-
-	var m [16]uint32
-
-	m[0] = uint32(p[0])<<24 | uint32(p[1])<<16 | uint32(p[2])<<8 | uint32(p[3])
-	m[1] = uint32(p[4])<<24 | uint32(p[5])<<16 | uint32(p[6])<<8 | uint32(p[7])
-	m[2] = uint32(p[8])<<24 | uint32(p[9])<<16 | uint32(p[10])<<8 | uint32(p[11])
-	m[3] = uint32(p[12])<<24 | uint32(p[13])<<16 | uint32(p[14])<<8 | uint32(p[15])
-	m[4] = uint32(p[16])<<24 | uint32(p[17])<<16 | uint32(p[18])<<8 | uint32(p[19])
-	m[5] = uint32(p[20])<<24 | uint32(p[21])<<16 | uint32(p[22])<<8 | uint32(p[23])
-	m[6] = uint32(p[24])<<24 | uint32(p[25])<<16 | uint32(p[26])<<8 | uint32(p[27])
-	m[7] = uint32(p[28])<<24 | uint32(p[29])<<16 | uint32(p[30])<<8 | uint32(p[31])
-	m[8] = uint32(p[32])<<24 | uint32(p[33])<<16 | uint32(p[34])<<8 | uint32(p[35])
-	m[9] = uint32(p[36])<<24 | uint32(p[37])<<16 | uint32(p[38])<<8 | uint32(p[39])
-	m[10] = uint32(p[40])<<24 | uint32(p[41])<<16 | uint32(p[42])<<8 | uint32(p[43])
-	m[11] = uint32(p[44])<<24 | uint32(p[45])<<16 | uint32(p[46])<<8 | uint32(p[47])
-	m[12] = uint32(p[48])<<24 | uint32(p[49])<<16 | uint32(p[50])<<8 | uint32(p[51])
-	m[13] = uint32(p[52])<<24 | uint32(p[53])<<16 | uint32(p[54])<<8 | uint32(p[55])
-	m[14] = uint32(p[56])<<24 | uint32(p[57])<<16 | uint32(p[58])<<8 | uint32(p[59])
-	m[15] = uint32(p[60])<<24 | uint32(p[61])<<16 | uint32(p[62])<<8 | uint32(p[63])
-
-	// Round 1.
-	v0 += m[0] ^ cst1
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[2] ^ cst3
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[4] ^ cst5
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[6] ^ cst7
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[5] ^ cst4
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[7] ^ cst6
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[3] ^ cst2
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[1] ^ cst0
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[8] ^ cst9
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[10] ^ cst11
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[12] ^ cst13
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[14] ^ cst15
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[13] ^ cst12
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[15] ^ cst14
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[11] ^ cst10
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[9] ^ cst8
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 2.
-	v0 += m[14] ^ cst10
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[4] ^ cst8
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[9] ^ cst15
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[13] ^ cst6
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[15] ^ cst9
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[6] ^ cst13
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[8] ^ cst4
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[10] ^ cst14
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[1] ^ cst12
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[0] ^ cst2
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[11] ^ cst7
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[5] ^ cst3
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[7] ^ cst11
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[3] ^ cst5
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[2] ^ cst0
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[12] ^ cst1
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 3.
-	v0 += m[11] ^ cst8
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[12] ^ cst0
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[5] ^ cst2
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[15] ^ cst13
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[2] ^ cst5
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[13] ^ cst15
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[0] ^ cst12
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[8] ^ cst11
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[10] ^ cst14
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[3] ^ cst6
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[7] ^ cst1
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[9] ^ cst4
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[1] ^ cst7
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[4] ^ cst9
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[6] ^ cst3
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[14] ^ cst10
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 4.
-	v0 += m[7] ^ cst9
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[3] ^ cst1
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[13] ^ cst12
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[11] ^ cst14
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[12] ^ cst13
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[14] ^ cst11
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[1] ^ cst3
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[9] ^ cst7
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[2] ^ cst6
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[5] ^ cst10
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[4] ^ cst0
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[15] ^ cst8
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[0] ^ cst4
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[8] ^ cst15
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[10] ^ cst5
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[6] ^ cst2
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 5.
-	v0 += m[9] ^ cst0
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[5] ^ cst7
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[2] ^ cst4
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[10] ^ cst15
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[4] ^ cst2
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[15] ^ cst10
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[7] ^ cst5
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[0] ^ cst9
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[14] ^ cst1
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[11] ^ cst12
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[6] ^ cst8
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[3] ^ cst13
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[8] ^ cst6
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[13] ^ cst3
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[12] ^ cst11
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[1] ^ cst14
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 6.
-	v0 += m[2] ^ cst12
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[6] ^ cst10
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[0] ^ cst11
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[8] ^ cst3
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[11] ^ cst0
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[3] ^ cst8
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[10] ^ cst6
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[12] ^ cst2
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[4] ^ cst13
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[7] ^ cst5
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[15] ^ cst14
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[1] ^ cst9
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[14] ^ cst15
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[9] ^ cst1
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[5] ^ cst7
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[13] ^ cst4
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 7.
-	v0 += m[12] ^ cst5
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[1] ^ cst15
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[14] ^ cst13
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[4] ^ cst10
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[13] ^ cst14
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[10] ^ cst4
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[15] ^ cst1
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[5] ^ cst12
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[0] ^ cst7
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[6] ^ cst3
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[9] ^ cst2
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[8] ^ cst11
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[2] ^ cst9
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[11] ^ cst8
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[3] ^ cst6
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[7] ^ cst0
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 8.
-	v0 += m[13] ^ cst11
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[7] ^ cst14
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[12] ^ cst1
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[3] ^ cst9
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[1] ^ cst12
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[9] ^ cst3
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[14] ^ cst7
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[11] ^ cst13
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[5] ^ cst0
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[15] ^ cst4
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[8] ^ cst6
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[2] ^ cst10
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[6] ^ cst8
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[10] ^ cst2
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[4] ^ cst15
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[0] ^ cst5
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 9.
-	v0 += m[6] ^ cst15
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[14] ^ cst9
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[11] ^ cst3
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[0] ^ cst8
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[3] ^ cst11
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[8] ^ cst0
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[9] ^ cst14
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[15] ^ cst6
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[12] ^ cst2
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[13] ^ cst7
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[1] ^ cst4
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[10] ^ cst5
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[4] ^ cst1
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[5] ^ cst10
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[7] ^ cst13
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[2] ^ cst12
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 10.
-	v0 += m[10] ^ cst2
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[8] ^ cst4
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[7] ^ cst6
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[1] ^ cst5
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[6] ^ cst7
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[5] ^ cst1
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[4] ^ cst8
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[2] ^ cst10
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[15] ^ cst11
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[9] ^ cst14
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[3] ^ cst12
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[13] ^ cst0
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[12] ^ cst3
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[0] ^ cst13
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[14] ^ cst9
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[11] ^ cst15
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 11.
-	v0 += m[0] ^ cst1
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[2] ^ cst3
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[4] ^ cst5
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[6] ^ cst7
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[5] ^ cst4
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[7] ^ cst6
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[3] ^ cst2
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[1] ^ cst0
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[8] ^ cst9
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[10] ^ cst11
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[12] ^ cst13
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[14] ^ cst15
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[13] ^ cst12
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[15] ^ cst14
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[11] ^ cst10
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[9] ^ cst8
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 12.
-	v0 += m[14] ^ cst10
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[4] ^ cst8
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[9] ^ cst15
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[13] ^ cst6
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[15] ^ cst9
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[6] ^ cst13
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[8] ^ cst4
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[10] ^ cst14
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[1] ^ cst12
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[0] ^ cst2
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[11] ^ cst7
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[5] ^ cst3
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[7] ^ cst11
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[3] ^ cst5
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[2] ^ cst0
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[12] ^ cst1
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 13.
-	v0 += m[11] ^ cst8
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[12] ^ cst0
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[5] ^ cst2
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[15] ^ cst13
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[2] ^ cst5
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[13] ^ cst15
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[0] ^ cst12
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[8] ^ cst11
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[10] ^ cst14
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[3] ^ cst6
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[7] ^ cst1
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[9] ^ cst4
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[1] ^ cst7
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[4] ^ cst9
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[6] ^ cst3
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[14] ^ cst10
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	// Round 14.
-	v0 += m[7] ^ cst9
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-16) | v12>>16
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-12) | v4>>12
-	v1 += m[3] ^ cst1
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-16) | v13>>16
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-12) | v5>>12
-	v2 += m[13] ^ cst12
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-16) | v14>>16
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-12) | v6>>12
-	v3 += m[11] ^ cst14
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-16) | v15>>16
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-12) | v7>>12
-	v2 += m[12] ^ cst13
-	v2 += v6
-	v14 ^= v2
-	v14 = v14<<(32-8) | v14>>8
-	v10 += v14
-	v6 ^= v10
-	v6 = v6<<(32-7) | v6>>7
-	v3 += m[14] ^ cst11
-	v3 += v7
-	v15 ^= v3
-	v15 = v15<<(32-8) | v15>>8
-	v11 += v15
-	v7 ^= v11
-	v7 = v7<<(32-7) | v7>>7
-	v1 += m[1] ^ cst3
-	v1 += v5
-	v13 ^= v1
-	v13 = v13<<(32-8) | v13>>8
-	v9 += v13
-	v5 ^= v9
-	v5 = v5<<(32-7) | v5>>7
-	v0 += m[9] ^ cst7
-	v0 += v4
-	v12 ^= v0
-	v12 = v12<<(32-8) | v12>>8
-	v8 += v12
-	v4 ^= v8
-	v4 = v4<<(32-7) | v4>>7
-	v0 += m[2] ^ cst6
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-16) | v15>>16
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-12) | v5>>12
-	v1 += m[5] ^ cst10
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-16) | v12>>16
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-12) | v6>>12
-	v2 += m[4] ^ cst0
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-16) | v13>>16
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-12) | v7>>12
-	v3 += m[15] ^ cst8
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-16) | v14>>16
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-12) | v4>>12
-	v2 += m[0] ^ cst4
-	v2 += v7
-	v13 ^= v2
-	v13 = v13<<(32-8) | v13>>8
-	v8 += v13
-	v7 ^= v8
-	v7 = v7<<(32-7) | v7>>7
-	v3 += m[8] ^ cst15
-	v3 += v4
-	v14 ^= v3
-	v14 = v14<<(32-8) | v14>>8
-	v9 += v14
-	v4 ^= v9
-	v4 = v4<<(32-7) | v4>>7
-	v1 += m[10] ^ cst5
-	v1 += v6
-	v12 ^= v1
-	v12 = v12<<(32-8) | v12>>8
-	v11 += v12
-	v6 ^= v11
-	v6 = v6<<(32-7) | v6>>7
-	v0 += m[6] ^ cst2
-	v0 += v5
-	v15 ^= v0
-	v15 = v15<<(32-8) | v15>>8
-	v10 += v15
-	v5 ^= v10
-	v5 = v5<<(32-7) | v5>>7
-
-	h0 ^= v0 ^ v8
-	h1 ^= v1 ^ v9
-	h2 ^= v2 ^ v10
-	h3 ^= v3 ^ v11
-	h4 ^= v4 ^ v12
-	h5 ^= v5 ^ v13
-	h6 ^= v6 ^ v14
-	h7 ^= v7 ^ v15
-
-	h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7] = h0, h1, h2, h3, h4, h5, h6, h7
-}
diff --git a/blake3.cl b/blake3.cl
new file mode 100644
index 0000000..cfc7c30
--- /dev/null
+++ b/blake3.cl
@@ -0,0 +1,159 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Decred BLAKE3 midstate-based kernel
+
+// Written and optimized by Dave Collins Sep 2023.
+//
+// AMD RX 580         - 3.68 Gh/s
+// AMD Vega 56        - 7.00 Gh/s
+// NVIDIA RTX 4070    - 14.85 Gh/s
+// NVIDIA Tesla V100  - 13.89 Gh/s
+// NVIDIA Tesla V100S - 14.60 Gh/s
+
+#define ROTR(v, n) rotate(v, (uint)(32U - n)) // Right-rotate 32-bit v by n bits; rotate() itself rotates left.
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void
+search(
+    volatile __global uint *restrict output,
+    // Midstate (chaining value words cv0..cv7).
+    const uint cv0,
+    const uint cv1,
+    const uint cv2,
+    const uint cv3,
+    const uint cv4,
+    const uint cv5,
+    const uint cv6,
+    const uint cv7,
+
+    // Final 52 bytes of data as words m0..m12; m3 is derived in the kernel.
+    const uint m0,
+    const uint m1,
+    const uint m2,
+    // const uint m3 : nonce
+    const uint m4,
+    const uint m5,
+    const uint m6,
+    const uint m7,
+    const uint m8,
+    const uint m9,
+    const uint m10,
+    const uint m11,
+    const uint m12)
+{
+    // Nonce: each work item uses its global id as message word m3.
+    const uint m3 = get_global_id(0);
+
+    // BLAKE3 init vectors (only iv0..iv3 are used below).
+    const uint iv0 = 0x6a09e667ul;
+    const uint iv1 = 0xbb67ae85ul;
+    const uint iv2 = 0x3c6ef372ul;
+    const uint iv3 = 0xa54ff53aul;
+    const uint iv4 = 0x510e527ful;
+    const uint iv5 = 0x9b05688cul;
+    const uint iv6 = 0x1f83d9abul;
+    const uint iv7 = 0x5be0cd19ul;
+
+    // Internal compression func state.
+    uint v0, v1, v2, v3, v4, v5, v6, v7;
+    uint v8, v9, v10, v11, v12, v13, v14, v15;
+
+    // Round 1, merged with state setup: v0..v7 = cv0..cv7, v8..v11 = iv0..iv3,
+    // v12 = v13 = 0, v14 = 52 (block length in bytes), v15 = 10 (flags).
+    v0 = cv0 + cv4 + m0; v12 = ROTR(v0, 16);       v8 = iv0 + v12;  v4 = ROTR(cv4 ^ v8, 12);  v0 += v4 + m1;  v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = cv1 + cv5 + m2; v13 = ROTR(v1, 16);       v9 = iv1 + v13;  v5 = ROTR(cv5 ^ v9, 12);  v1 += v5 + m3;  v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = cv2 + cv6 + m4; v14 = ROTR(52 ^ v2, 16);  v10 = iv2 + v14; v6 = ROTR(cv6 ^ v10, 12); v2 += v6 + m5;  v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = cv3 + cv7 + m6; v15 = ROTR(10 ^ v3, 16);  v11 = iv3 + v15; v7 = ROTR(cv7 ^ v11, 12); v3 += v7 + m7;  v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m8;   v15 = ROTR(v15 ^ v0, 16); v10 += v15;      v5 = ROTR(v5 ^ v10, 12);  v0 += v5 + m9;  v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m10;  v12 = ROTR(v12 ^ v1, 16); v11 += v12;      v6 = ROTR(v6 ^ v11, 12);  v1 += v6 + m11; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m12;  v13 = ROTR(v13 ^ v2, 16); v8 += v13;       v7 = ROTR(v7 ^ v8, 12);   v2 += v7;       v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4;        v14 = ROTR(v14 ^ v3, 16); v9 += v14;       v4 = ROTR(v4 ^ v9, 12);   v3 += v4;       v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 2 with message word permutation.
+    v0 = v0 + v4 + m2;  v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12);  v0 += v4 + m6;  v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m3;  v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12);  v1 += v5 + m10; v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m7;  v14 = ROTR(v14 ^ v2, 16); v10 += v14;    v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m0;  v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m4;  v15 = ROTR(v15 ^ v3, 16); v11 += v15;    v7 = ROTR(v7 ^ v11, 12); v3 += v7;       v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m1;  v15 = ROTR(v15 ^ v0, 16); v10 += v15;    v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m11; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m12; v12 = ROTR(v12 ^ v1, 16); v11 += v12;    v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m5;  v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m9;  v13 = ROTR(v13 ^ v2, 16); v8 += v13;     v7 = ROTR(v7 ^ v8, 12);  v2 += v7;       v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4;       v14 = ROTR(v14 ^ v3, 16); v9 += v14;     v4 = ROTR(v4 ^ v9, 12);  v3 += v4 + m8;  v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 3 with message word permutation.
+    v0 = v0 + v4 + m3;  v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12);  v0 += v4 + m4;  v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m10; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12);  v1 += v5 + m12; v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6;       v14 = ROTR(v14 ^ v2, 16); v10 += v14;    v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m2;  v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m7;  v15 = ROTR(v15 ^ v3, 16); v11 += v15;    v7 = ROTR(v7 ^ v11, 12); v3 += v7;       v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m6;  v15 = ROTR(v15 ^ v0, 16); v10 += v15;    v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m5;  v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m9;  v12 = ROTR(v12 ^ v1, 16); v11 += v12;    v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m0;  v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m11; v13 = ROTR(v13 ^ v2, 16); v8 += v13;     v7 = ROTR(v7 ^ v8, 12);  v2 += v7;       v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m8;  v14 = ROTR(v14 ^ v3, 16); v9 += v14;     v4 = ROTR(v4 ^ v9, 12);  v3 += v4 + m1;  v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 4 with message word permutation.
+    v0 = v0 + v4 + m10; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12);  v0 += v4 + m7;  v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m12; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12);  v1 += v5 + m9;  v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6;       v14 = ROTR(v14 ^ v2, 16); v10 += v14;    v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m3;  v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7;       v15 = ROTR(v15 ^ v3, 16); v11 += v15;    v7 = ROTR(v7 ^ v11, 12); v3 += v7;       v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m4;  v15 = ROTR(v15 ^ v0, 16); v10 += v15;    v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m0;  v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m11; v12 = ROTR(v12 ^ v1, 16); v11 += v12;    v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m2;  v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m5;  v13 = ROTR(v13 ^ v2, 16); v8 += v13;     v7 = ROTR(v7 ^ v8, 12);  v2 += v7 + m8;  v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m1;  v14 = ROTR(v14 ^ v3, 16); v9 += v14;     v4 = ROTR(v4 ^ v9, 12);  v3 += v4 + m6;  v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 5 with message word permutation.
+    v0 = v0 + v4 + m12; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12);  v0 += v4;       v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m9;  v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12);  v1 += v5 + m11; v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6;       v14 = ROTR(v14 ^ v2, 16); v10 += v14;    v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m10; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7;       v15 = ROTR(v15 ^ v3, 16); v11 += v15;    v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m8;  v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m7;  v15 = ROTR(v15 ^ v0, 16); v10 += v15;    v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m2;  v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m5;  v12 = ROTR(v12 ^ v1, 16); v11 += v12;    v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m3;  v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m0;  v13 = ROTR(v13 ^ v2, 16); v8 += v13;     v7 = ROTR(v7 ^ v8, 12);  v2 += v7 + m1;  v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m6;  v14 = ROTR(v14 ^ v3, 16); v9 += v14;     v4 = ROTR(v4 ^ v9, 12);  v3 += v4 + m4;  v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 6 with message word permutation.
+    v0 = v0 + v4 + m9;  v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12);  v0 += v4;       v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m11; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12);  v1 += v5 + m5;  v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m8;  v14 = ROTR(v14 ^ v2, 16); v10 += v14;    v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m12; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7;       v15 = ROTR(v15 ^ v3, 16); v11 += v15;    v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m1;  v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5;       v15 = ROTR(v15 ^ v0, 16); v10 += v15;    v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m3;  v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m0;  v12 = ROTR(v12 ^ v1, 16); v11 += v12;    v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m10; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m2;  v13 = ROTR(v13 ^ v2, 16); v8 += v13;     v7 = ROTR(v7 ^ v8, 12);  v2 += v7 + m6;  v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m4;  v14 = ROTR(v14 ^ v3, 16); v9 += v14;     v4 = ROTR(v4 ^ v9, 12);  v3 += v4 + m7;  v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 7 with message word permutation.
+    v0 = v0 + v4 + m11; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12);  v0 += v4;       v12 = ROTR(v12 ^ v0, 8); v8 += v12;  v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m5;  v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12);  v1 += v5 + m0;  v13 = ROTR(v13 ^ v1, 8); v9 += v13;  v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m1;  v14 = ROTR(v14 ^ v2, 16); v10 += v14;    v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m9;  v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m8;  v15 = ROTR(v15 ^ v3, 16); v11 += v15;    v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m6;  v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5;       v15 = ROTR(v15 ^ v0, 16); v10 += v15;    v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m10; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m2;  v12 = ROTR(v12 ^ v1, 16); v11 += v12;    v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m12; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m3;  v13 = ROTR(v13 ^ v2, 16); v8 += v13;     v7 = ROTR(v7 ^ v8, 12);  v2 += v7 + m4;  v13 = ROTR(v13 ^ v2, 8); v8 += v13;  v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m7;  v14 = ROTR(v14 ^ v3, 16); v9 += v14;     v4 = ROTR(v4 ^ v9, 12);  v3 += v4;       v14 = ROTR(v14 ^ v3, 8); v9 += v14;  v4 = ROTR(v4 ^ v9, 7);
+
+    // Finally the truncated 256-bit output is defined as:
+    //
+    // h'0 = v0^v8
+    // h'1 = v1^v9
+    // h'2 = v2^v10
+    // h'3 = v3^v11
+    // h'4 = v4^v12
+    // h'5 = v5^v13
+    // h'6 = v6^v14
+    // h'7 = v7^v15
+    //
+    // Only notify the miner that a potential solution was found when the last
+    // word (32 bits) is zeroed so it can check against the target difficulty.
+
+    // Debug code to print result of hashing function.
+    // if (!((v7 ^ v15) & 0xffff0000)) {
+    //     printf("hash on gpu %x %x %x %x %x %x %x %x\n",
+    //         v0 ^ v8, v1 ^ v9, v2 ^ v10, v3 ^ v11,
+    //         v4 ^ v12, v5 ^ v13, v6 ^ v14, v7 ^ v15);
+    //     printf("nonce for hash on gpu %x\n", m3);
+    // }
+
+    if (v7 ^ v15)
+        return;
+
+    // Record nonce: bump the index in output[0] (non-atomic), store m3 there.
+    output[++output[0]] = m3;
+}
diff --git a/blake3/block.go b/blake3/block.go
new file mode 100644
index 0000000..ff2d224
--- /dev/null
+++ b/blake3/block.go
@@ -0,0 +1,208 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Decred BLAKE3 midstate-based kernel
+//
+// Written and optimized by Dave Collins Aug 2023.
+
+// Package blake3 provides a minimal implementation of BLAKE3 that accepts
+// midstates and is tailored specifically to Decred.
+package blake3
+
+import (
+	"math/bits"
+)
+
+// BLAKE3 domain separation flags.
+const (
+	FlagChunkStart = 1
+	flagChunkEnd   = 2
+	flagRoot       = 8
+)
+
+// IV is the BLAKE3 initialization vector.
+var IV = [8]uint32{
+	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
+}
+
+// g is the quarter round function that each round applies to the 4x4 internal
+// state in the compression function.
+func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
+	a += b + mx
+	d = bits.RotateLeft32(d^a, -16)
+	c += d
+	b = bits.RotateLeft32(b^c, -12)
+	a += b + my
+	d = bits.RotateLeft32(d^a, -8)
+	c += d
+	b = bits.RotateLeft32(b^c, -7)
+	return a, b, c, d
+}
+
+// compress is a stripped down version of the BLAKE3 node compression function
+// that will only work properly with a single chunk and truncates the output to
+// 256 bits.
+func compress(cv [8]uint32, block [16]uint32, blockLen, flags uint32) [8]uint32 {
+	// The compression func initializes the 16-word internal state as follows:
+	//
+	// h0..h7 is the input chaining value (cv).
+	//
+	// iv0..iv3 are the first 4 words of the constant initialization vector.
+	//
+	// t0 and t1 are the lower and higher order words of a 64-bit counter, but
+	// since only a single chunk is ever hashed in this stripped down version,
+	// it's always zero here.
+	//
+	// b is the number of input bytes in the block (blockLen).
+	//
+	// d is the domain separation bit flags (flags).
+	//
+	// |v0  v1  v2  v3 |   |h0  h1  h2  h3 |
+	// |v4  v5  v6  v7 |   |h4  h5  h6  h7 |
+	// |v8  v9  v10 v11| = |iv0 iv1 iv2 iv3|
+	// |v12 v13 v14 v15|   |t0  t1  b   d  |
+	//
+	// Each round consists of 8 applications of the G function as follows:
+	// G0(v0,v4,v8,v12)   G1(v1,v5,v9,v13)   G2(v2,v6,v10,v14)  G3(v3,v7,v11,v15)
+	// G4(v0,v5,v10,v15)  G5(v1,v6,v11,v12)  G6(v2,v7,v8,v13)   G7(v3,v4,v9,v14)
+	//
+	// In other words, the G function is applied to each column of the 4x4 state
+	// and then to each of the diagonals.
+	//
+	// In addition, after each of the first 6 rounds, the message words are
+	// permuted according to the following table:
+	//
+	// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+	// 2  6  3  10 7  0  4  13 1  11 12 5  9  14 15 8
+
+	// Do the initialization and first round together.
+	v0, v4, v8, v12 := g(cv[0], cv[4], IV[0], 0, block[0], block[1])
+	v1, v5, v9, v13 := g(cv[1], cv[5], IV[1], 0, block[2], block[3])
+	v2, v6, v10, v14 := g(cv[2], cv[6], IV[2], blockLen, block[4], block[5])
+	v3, v7, v11, v15 := g(cv[3], cv[7], IV[3], flags, block[6], block[7])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[8], block[9])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[10], block[11])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[12], block[13])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[14], block[15])
+
+	// 2nd round with message word permutation.
+	v0, v4, v8, v12 = g(v0, v4, v8, v12, block[2], block[6])
+	v1, v5, v9, v13 = g(v1, v5, v9, v13, block[3], block[10])
+	v2, v6, v10, v14 = g(v2, v6, v10, v14, block[7], block[0])
+	v3, v7, v11, v15 = g(v3, v7, v11, v15, block[4], block[13])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[1], block[11])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[12], block[5])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[9], block[14])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[15], block[8])
+
+	// 3rd round with message word permutation.
+	v0, v4, v8, v12 = g(v0, v4, v8, v12, block[3], block[4])
+	v1, v5, v9, v13 = g(v1, v5, v9, v13, block[10], block[12])
+	v2, v6, v10, v14 = g(v2, v6, v10, v14, block[13], block[2])
+	v3, v7, v11, v15 = g(v3, v7, v11, v15, block[7], block[14])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[6], block[5])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[9], block[0])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[11], block[15])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[8], block[1])
+
+	// 4th round with message word permutation.
+	v0, v4, v8, v12 = g(v0, v4, v8, v12, block[10], block[7])
+	v1, v5, v9, v13 = g(v1, v5, v9, v13, block[12], block[9])
+	v2, v6, v10, v14 = g(v2, v6, v10, v14, block[14], block[3])
+	v3, v7, v11, v15 = g(v3, v7, v11, v15, block[13], block[15])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[4], block[0])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[11], block[2])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[5], block[8])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[1], block[6])
+
+	// 5th round with message word permutation.
+	v0, v4, v8, v12 = g(v0, v4, v8, v12, block[12], block[13])
+	v1, v5, v9, v13 = g(v1, v5, v9, v13, block[9], block[11])
+	v2, v6, v10, v14 = g(v2, v6, v10, v14, block[15], block[10])
+	v3, v7, v11, v15 = g(v3, v7, v11, v15, block[14], block[8])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[7], block[2])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[5], block[3])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[0], block[1])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[6], block[4])
+
+	// 6th round with message word permutation.
+	v0, v4, v8, v12 = g(v0, v4, v8, v12, block[9], block[14])
+	v1, v5, v9, v13 = g(v1, v5, v9, v13, block[11], block[5])
+	v2, v6, v10, v14 = g(v2, v6, v10, v14, block[8], block[12])
+	v3, v7, v11, v15 = g(v3, v7, v11, v15, block[15], block[1])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[13], block[3])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[0], block[10])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[2], block[6])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[4], block[7])
+
+	// 7th round with message word permutation.
+	v0, v4, v8, v12 = g(v0, v4, v8, v12, block[11], block[15])
+	v1, v5, v9, v13 = g(v1, v5, v9, v13, block[5], block[0])
+	v2, v6, v10, v14 = g(v2, v6, v10, v14, block[1], block[9])
+	v3, v7, v11, v15 = g(v3, v7, v11, v15, block[8], block[6])
+	v0, v5, v10, v15 = g(v0, v5, v10, v15, block[14], block[10])
+	v1, v6, v11, v12 = g(v1, v6, v11, v12, block[2], block[12])
+	v2, v7, v8, v13 = g(v2, v7, v8, v13, block[3], block[4])
+	v3, v4, v9, v14 = g(v3, v4, v9, v14, block[7], block[13])
+
+	// Finally the output is defined as:
+	//
+	// h'0 = v0^v8   h'8  = v8^h0
+	// h'1 = v1^v9   h'9  = v9^h1
+	// h'2 = v2^v10  h'10 = v10^h2
+	// h'3 = v3^v11  h'11 = v11^h3
+	// h'4 = v4^v12  h'12 = v12^h4
+	// h'5 = v5^v13  h'13 = v13^h5
+	// h'6 = v6^v14  h'14 = v14^h6
+	// h'7 = v7^v15  h'15 = v15^h7
+	//
+	// However, the upper results are ignored since only the first 256 bits are
+	// needed in this stripped down version.
+	return [8]uint32{
+		0: v0 ^ v8,
+		1: v1 ^ v9,
+		2: v2 ^ v10,
+		3: v3 ^ v11,
+		4: v4 ^ v12,
+		5: v5 ^ v13,
+		6: v6 ^ v14,
+		7: v7 ^ v15,
+	}
+}
+
+// block runs a single iteration of the BLAKE3 block compression function on the
+// provided block in b using the provided domain-specific flags and returns the
+// 256-bit truncated state to be chained into the next iteration for the next
+// block of data (aka the midstate).
+//
+// The data in b MUST NOT exceed 64 bytes for a correct result.
+func block(midstate [8]uint32, b []byte, flags uint32) [8]uint32 {
+	var block [16]uint32
+	intoWords(&block, b)
+	return compress(midstate, block, uint32(len(b)), flags)
+}
+
+// Block runs a single iteration of the BLAKE3 block compression function on the
+// provided 64-byte block in b using the provided domain-specific flags and
+// returns the 256-bit truncated state to be chained into the next iteration for
+// the next block of data (aka the midstate).
+//
+// The data in b MUST be 64 bytes.  The first iteration must pass the exported
+// IV for the midstate and FlagChunkStart for the flags to signal the start of
+// the chunk, while the second must pass the midstate returned from the first
+// iteration as well as 0 for the flags.
+//
+// This function is purpose built with the expectation that it will only be
+// run on two blocks.  It is not guaranteed to produce correct results in other
+// scenarios.
+func Block(midstate [8]uint32, b []byte, flags uint32) [8]uint32 {
+	return block(midstate, b, flags)
+}
+
+// FinalBlock returns the finalized BLAKE3 hash for the given midstate and
+// provided portion of the final block to hash.
+//
+// The data in b MUST NOT exceed 64 bytes for a correct result.
+func FinalBlock(midstate [8]uint32, b []byte) [32]byte {
+	return asBytes(block(midstate, b, flagChunkEnd|flagRoot))
+}
diff --git a/blake3/convert_amd64.go b/blake3/convert_amd64.go
new file mode 100644
index 0000000..5036c28
--- /dev/null
+++ b/blake3/convert_amd64.go
@@ -0,0 +1,21 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Written and optimized by Dave Collins Aug 2023.
+
+package blake3
+
+import "unsafe"
+
+// intoWords writes the provided data in b to the provided array of uint32
+// words.
+//
+// The data in b MUST NOT exceed 64 bytes for a correct result.
+func intoWords(words *[16]uint32, b []byte) {
+	wordBytes := (*[64]byte)(unsafe.Pointer(words))[:]
+	copy(wordBytes, b)
+}
+
+// asBytes converts the provided array of uint32 words into bytes.
+func asBytes(cv [8]uint32) [32]byte {
+	return *(*[32]byte)(unsafe.Pointer(&cv))
+}
diff --git a/blake3/convert_generic.go b/blake3/convert_generic.go
new file mode 100644
index 0000000..5298ed9
--- /dev/null
+++ b/blake3/convert_generic.go
@@ -0,0 +1,33 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Written and optimized by Dave Collins Aug 2023.
+
+//go:build !amd64
+// +build !amd64
+
+package blake3
+
+import (
+	"encoding/binary"
+)
+
+// intoWords writes the provided data in b to the provided array of uint32
+// words.
+//
+// The data in b MUST NOT exceed 64 bytes for a correct result.
+func intoWords(words *[16]uint32, b []byte) {
+	var block [64]byte
+	copy(block[:], b)
+	for i := range words {
+		words[i] = binary.LittleEndian.Uint32(block[4*i:])
+	}
+}
+
+// asBytes converts the provided array of uint32 words into bytes.
+func asBytes(cv [8]uint32) [32]byte {
+	var b [32]byte
+	for i, v := range cv {
+		binary.LittleEndian.PutUint32(b[4*i:], v)
+	}
+	return b
+}
diff --git a/calibrate.go b/calibrate.go
index 7e61c15..6faa428 100644
--- a/calibrate.go
+++ b/calibrate.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 //go:build !cuda
 // +build !cuda
@@ -107,7 +107,7 @@ func (d *Device) calcWorkSizeForMilliseconds(ms int) (uint32, error) {
 
 		// If we fail to go above the desired execution time, double
 		// the work size and try again.
-		if execTime < timeToAchieve {
+		if execTime < timeToAchieve && workSize < 1<<30 {
 			workSize <<= 1
 			continue
 		}
diff --git a/cl/cl.h b/cl/cl.h
index e2d8943..ed86743 100644
--- a/cl/cl.h
+++ b/cl/cl.h
@@ -4,6 +4,7 @@
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#define CL_TARGET_OPENCL_VERSION 220
 
 #ifdef __APPLE__
 #include "OpenCL/opencl.h"
diff --git a/cladldevice.go b/cladldevice.go
index 1fc1a86..9d17216 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 //go:build opencladl && !cuda && !opencl
 // +build opencladl,!cuda,!opencl
@@ -214,9 +214,12 @@ func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
 	var numDevices cl.CL_uint
 	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, 0, nil,
 		&numDevices)
-	if status != cl.CL_SUCCESS {
+	if status != cl.CL_SUCCESS && status != cl.CL_DEVICE_NOT_FOUND {
 		return nil, clError(status, "CLGetDeviceIDs")
 	}
+	if numDevices == 0 {
+		return nil, nil
+	}
 	devices := make([]cl.CL_device_id, numDevices)
 	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices,
 		devices, nil)
@@ -460,12 +463,12 @@ func (d *Device) runDevice() error {
 	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
 
-	// Bump the extraNonce for the device it's running on
-	// when you begin mining. This ensures each device is doing
-	// different work. If the extraNonce has already been
-	// set for valid work, restore that.
-	d.extraNonce += uint32(d.index) << 24
-	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+	// Initialize the nonces for the device such that each device in the same
+	// system is doing different work while also helping prevent collisions
+	// across multiple processes and systems working on the same template.
+	if err := d.initNonces(); err != nil {
+		return err
+	}
 
 	var status cl.CL_int
 	for {
@@ -479,7 +482,7 @@ func (d *Device) runDevice() error {
 
 		// Increment extraNonce.
 		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = d.extraNonce
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
@@ -488,7 +491,7 @@ func (d *Device) runDevice() error {
 			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
 			ts = d.work.JobTime + diffSeconds
 		}
-		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
+		d.lastBlock[work.TimestampWord] = ts
 
 		// arg 0: pointer to the buffer
 		obuf := d.outputBuffer
@@ -556,8 +559,7 @@ func (d *Device) runDevice() error {
 			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
 				"extraNonce %08x, workID %08x, timestamp %08x",
 				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
-				util.Uint32EndiannessSwap(d.currentWorkID),
-				d.lastBlock[work.TimestampWord])
+				d.currentWorkID, d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
@@ -613,7 +615,6 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 		}
 	}
 	return m, deviceListEnabledCount, nil
-
 }
 
 func getDeviceInfo(id cl.CL_device_id,
diff --git a/cldevice.go b/cldevice.go
index 6a28350..74d5d35 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 //go:build opencl && !cuda && !opencladl
 // +build opencl,!cuda,!opencladl
@@ -342,9 +342,12 @@ func getCLDevices(platform cl.CL_platform_id) ([]cl.CL_device_id, error) {
 	var numDevices cl.CL_uint
 	status := cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, 0, nil,
 		&numDevices)
-	if status != cl.CL_SUCCESS {
+	if status != cl.CL_SUCCESS && status != cl.CL_DEVICE_NOT_FOUND {
 		return nil, clError(status, "CLGetDeviceIDs")
 	}
+	if numDevices == 0 {
+		return nil, nil
+	}
 	devices := make([]cl.CL_device_id, numDevices)
 	status = cl.CLGetDeviceIDs(platform, cl.CL_DEVICE_TYPE_ALL, numDevices,
 		devices, nil)
@@ -589,12 +592,12 @@ func (d *Device) runDevice() error {
 	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
 
-	// Bump the extraNonce for the device it's running on
-	// when you begin mining. This ensures each device is doing
-	// different work. If the extraNonce has already been
-	// set for valid work, restore that.
-	d.extraNonce += uint32(d.index) << 24
-	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+	// Initialize the nonces for the device such that each device in the same
+	// system is doing different work while also helping prevent collisions
+	// across multiple processes and systems working on the same template.
+	if err := d.initNonces(); err != nil {
+		return err
+	}
 
 	var status cl.CL_int
 	for {
@@ -608,7 +611,7 @@ func (d *Device) runDevice() error {
 
 		// Increment extraNonce.
 		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = d.extraNonce
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
@@ -617,7 +620,7 @@ func (d *Device) runDevice() error {
 			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
 			ts = d.work.JobTime + diffSeconds
 		}
-		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
+		d.lastBlock[work.TimestampWord] = ts
 
 		// arg 0: pointer to the buffer
 		obuf := d.outputBuffer
@@ -685,8 +688,7 @@ func (d *Device) runDevice() error {
 			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
 				"extraNonce %08x, workID %08x, timestamp %08x",
 				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
-				util.Uint32EndiannessSwap(d.currentWorkID),
-				d.lastBlock[work.TimestampWord])
+				d.currentWorkID, d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
@@ -742,7 +744,6 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 		}
 	}
 	return m, deviceListEnabledCount, nil
-
 }
 
 func getDeviceInfo(id cl.CL_device_id,
diff --git a/config.go b/config.go
index bbce270..07c00b7 100644
--- a/config.go
+++ b/config.go
@@ -1,5 +1,5 @@
 // Copyright (c) 2013-2015 The btcsuite developers
-// Copyright (c) 2015-2016 The Decred developers
+// Copyright (c) 2015-2023 The Decred developers
 
 package main
 
@@ -25,7 +25,7 @@ const (
 	defaultLogLevel       = "info"
 	defaultLogDirname     = "logs"
 	defaultLogFilename    = "gominer.log"
-	defaultClKernel       = "blake256.cl"
+	defaultClKernel       = "blake3.cl"
 )
 
 var (
diff --git a/cudevice.go b/cudevice.go
index 3d2b9c9..fe015c9 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 //go:build cuda && !opencl
 // +build cuda,!opencl
@@ -274,12 +274,12 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 }
 
 func (d *Device) runDevice() error {
-	// Bump the extraNonce for the device it's running on
-	// when you begin mining. This ensures each GPU is doing
-	// different work. If the extraNonce has already been
-	// set for valid work, restore that.
-	d.extraNonce += uint32(d.index) << 24
-	d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+	// Initialize the nonces for the device such that each device in the same
+	// system is doing different work while also helping prevent collisions
+	// across multiple processes and systems working on the same template.
+	if err := d.initNonces(); err != nil {
+		return err
+	}
 
 	// Need to have this stuff here for a device vs thread issue.
 	runtime.LockOSThread()
@@ -317,7 +317,7 @@ func (d *Device) runDevice() error {
 
 		// Increment extraNonce.
 		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = util.Uint32EndiannessSwap(d.extraNonce)
+		d.lastBlock[work.Nonce1Word] = d.extraNonce
 
 		copy(endianData[:], d.work.Data[:128])
 		for i, j := 128, 0; i < 180; {
@@ -336,7 +336,7 @@ func (d *Device) runDevice() error {
 			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
 			ts = d.work.JobTime + diffSeconds
 		}
-		d.lastBlock[work.TimestampWord] = util.Uint32EndiannessSwap(ts)
+		d.lastBlock[work.TimestampWord] = ts
 
 		nonceResultsHSlice[0] = 0
 
@@ -360,14 +360,10 @@ func (d *Device) runDevice() error {
 
 		numResults := nonceResultsHSlice[0]
 		for i, result := range nonceResultsHSlice[1 : 1+numResults] {
-			// lol seelog
-			i := i
-			result := result
 			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
 				"extraNonce %08x, workID %08x, timestamp %08x",
 				d.index, i, result, d.lastBlock[work.Nonce1Word],
-				util.Uint32EndiannessSwap(d.currentWorkID),
-				d.lastBlock[work.TimestampWord])
+				d.currentWorkID, d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
diff --git a/device.go b/device.go
index 465fe4e..ba3adab 100644
--- a/device.go
+++ b/device.go
@@ -1,23 +1,38 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 package main
 
 import (
+	"crypto/rand"
 	"encoding/binary"
-	"encoding/hex"
+	"fmt"
+	"io"
 	"sync/atomic"
 	"time"
 
 	"github.com/decred/dcrd/blockchain/standalone/v2"
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/chaincfg/v3"
-
-	"github.com/decred/gominer/blake256"
+	"github.com/decred/gominer/blake3"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
 var chainParams = chaincfg.MainNetParams()
+var deviceLibraryInitialized = false // nolint:unused
+
+// randDeviceOffset1 and randDeviceOffset2 are random offsets to use for all
+// devices so each process ends up with a random starting point for all devices.
+var randDeviceOffset1, randDeviceOffset2 uint8
+
+func init() {
+	var buf [2]byte
+	if _, err := io.ReadFull(rand.Reader, buf[:]); err != nil {
+		panic(err)
+	}
+	randDeviceOffset1 = buf[0]
+	randDeviceOffset2 = buf[1]
+}
 
 // Constants for fan and temperature bits
 const (
@@ -44,6 +59,43 @@ const (
 	TargetNone                = "None"
 )
 
+// initNonces initializes the nonces for the device such that each device in the
+// same system is doing different work while also helping prevent collisions
+// across multiple processes and systems working on the same template.
+func (d *Device) initNonces() error {
+	// Read cryptographically random data for use below in setting the initial
+	// nonces.
+	var buf [8]byte
+	if _, err := io.ReadFull(rand.Reader, buf[:]); err != nil {
+		return fmt.Errorf("unable to read random value: %w", err)
+	}
+	extraNonceRandOffset := binary.LittleEndian.Uint32(buf[0:])
+	extraNonce2RandOffset := binary.LittleEndian.Uint32(buf[4:])
+
+	// Set the initial extra nonce as follows:
+	// - The first byte is the device ID offset by the first per-process random
+	//   device offset
+	// - The remaining 3 bytes are a per-device random extra nonce offset
+	//
+	// This, when coupled with the second per-process random device offset set
+	// elsewhere, ensures each device in the same system is doing different work
+	// (up to 255 devices) while also helping prevent collisions across
+	// multiple processes and systems working on the same template.
+	deviceOffset := (uint32(d.index) + uint32(randDeviceOffset1)) % 255
+	d.extraNonce = deviceOffset<<24 | extraNonceRandOffset&0x00ffffff
+
+	// Set the current work ID to a random initial value.
+	//
+	// The current work ID is also treated as a secondary extra nonce and thus,
+	// when combined with the extra nonce above, the result is that the pair
+	// effectively acts as an 8-byte randomized extra nonce.
+	d.currentWorkID = extraNonce2RandOffset
+
+	minrLog.Debugf("DEV #%d: initial extraNonce %x, initial workID: %x",
+		d.index, d.extraNonce, d.currentWorkID)
+	return nil
+}
+
 func (d *Device) updateCurrentWork() {
 	var w *work.Work
 	if d.hasWork {
@@ -67,28 +119,28 @@ func (d *Device) updateCurrentWork() {
 	d.hasWork = true
 
 	d.work = *w
-	minrLog.Tracef("pre-nonce: %v", hex.EncodeToString(d.work.Data[:]))
+	minrLog.Tracef("pre-nonce: %x", d.work.Data[:])
 
-	// Bump and set the work ID if the work is new.
+	// Bump and set the work ID.
 	d.currentWorkID++
 	binary.LittleEndian.PutUint32(d.work.Data[128+4*work.Nonce2Word:],
 		d.currentWorkID)
 
-	// Reset the hash state
-	copy(d.midstate[:], blake256.IV256[:])
+	// Set additional byte with the device id offset by a second per-process
+	// random device offset to support up to 255 devices.
+	deviceID := uint8((uint32(d.index) + uint32(randDeviceOffset2)) % 255)
+	d.work.Data[128+4*work.Nonce3Word] = deviceID
 
-	// Hash the two first blocks
-	blake256.Block(d.midstate[:], d.work.Data[0:64], 512)
-	blake256.Block(d.midstate[:], d.work.Data[64:128], 1024)
-	minrLog.Tracef("midstate input data for work update %v",
-		hex.EncodeToString(d.work.Data[0:128]))
+	// Hash the two first blocks.
+	d.midstate = blake3.Block(blake3.IV, d.work.Data[0:64], blake3.FlagChunkStart)
+	d.midstate = blake3.Block(d.midstate, d.work.Data[64:128], 0)
+	minrLog.Tracef("midstate input data for work update %x", d.work.Data[0:128])
 
 	// Convert the next block to uint32 array.
 	for i := 0; i < 16; i++ {
-		d.lastBlock[i] = binary.BigEndian.Uint32(d.work.Data[128+i*4 : 132+i*4])
+		d.lastBlock[i] = binary.LittleEndian.Uint32(d.work.Data[128+i*4:])
 	}
-	minrLog.Tracef("work data for work update: %v",
-		hex.EncodeToString(d.work.Data[:]))
+	minrLog.Tracef("work data for work update: %x", d.work.Data)
 }
 
 func (d *Device) Run() {
@@ -271,17 +323,17 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	data := make([]byte, 192)
 	copy(data, d.work.Data[:])
 
-	binary.BigEndian.PutUint32(data[128+4*work.TimestampWord:], ts)
-	binary.BigEndian.PutUint32(data[128+4*work.Nonce0Word:], nonce0)
-	binary.BigEndian.PutUint32(data[128+4*work.Nonce1Word:], nonce1)
-	hash := chainhash.HashH(data[0:180])
+	binary.LittleEndian.PutUint32(data[128+4*work.TimestampWord:], ts)
+	binary.LittleEndian.PutUint32(data[128+4*work.Nonce0Word:], nonce0)
+	binary.LittleEndian.PutUint32(data[128+4*work.Nonce1Word:], nonce1)
+	hash := chainhash.Hash(blake3.FinalBlock(d.midstate, data[128:180]))
 
 	// Hashes that reach this logic and fail the minimal proof of
 	// work check are considered to be hardware errors.
 	hashNum := standalone.HashToBig(&hash)
 	if hashNum.Cmp(chainParams.PowLimit) > 0 {
-		minrLog.Errorf("DEV #%d Hardware error found, hash %v above "+
-			"minimum target %064x", d.index, hash, d.work.Target.Bytes())
+		minrLog.Errorf("DEV #%d: Hardware error found, hash %v above "+
+			"minimum target %064x", d.index, hash, chainParams.PowLimit)
 		d.invalidShares++
 		return
 	}
@@ -291,10 +343,10 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	if !cfg.Benchmark {
 		// Assess versus the pool or daemon target.
 		if hashNum.Cmp(d.work.Target) > 0 {
-			minrLog.Debugf("DEV #%d Hash %v bigger than target %032x (boo)",
-				d.index, hash, d.work.Target.Bytes())
+			minrLog.Debugf("DEV #%d: Hash %v bigger than target %064x (boo)",
+				d.index, hash, d.work.Target)
 		} else {
-			minrLog.Infof("DEV #%d Found hash with work below target! %v (yay)",
+			minrLog.Infof("DEV #%d: Found hash with work below target! %v (yay)",
 				d.index, hash)
 			d.validShares++
 			d.workDone <- data
diff --git a/miner.go b/miner.go
index b64a100..4f02462 100644
--- a/miner.go
+++ b/miner.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 package main
 
@@ -8,6 +8,8 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/decred/dcrd/chaincfg/chainhash"
+	"github.com/decred/dcrd/crypto/blake256"
 	"github.com/decred/gominer/stratum"
 	"github.com/decred/gominer/work"
 )
@@ -34,8 +36,6 @@ func NewMiner() (*Miner, error) {
 		needsWorkRefresh: make(chan struct{}),
 	}
 
-	m.devices = make([]*Device, 0)
-
 	// If needed, start pool code.
 	if cfg.Pool != "" && !cfg.Benchmark {
 		s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword, cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version())
@@ -76,8 +76,8 @@ func (m *Miner) workSubmitThread() {
 				} else {
 					if accepted {
 						atomic.AddUint64(&m.validShares, 1)
-						minrLog.Debugf("Submitted work successfully: %v",
-							accepted)
+						minrLog.Infof("Submitted work successfully: block hash %v",
+							chainhash.Hash(blake256.Sum256(data[:180])))
 					} else {
 						atomic.AddUint64(&m.invalidShares, 1)
 					}
diff --git a/sample-gominer.conf b/sample-gominer.conf
index bf3f38c..630fd8e 100644
--- a/sample-gominer.conf
+++ b/sample-gominer.conf
@@ -104,7 +104,7 @@
 ; ------------------------------------------------------------------------------
 
 ; Location of kernel to use for mining (opencl only).
-; kernel=./blake256.cl
+; kernel=./blake3.cl
 
 ; Autocalibrate time target in ms to spend executing hashes for each iteration.
 ; autocalibrate=40
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 9c80ea3..4ea8231 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -951,8 +951,8 @@ func (s *Stratum) PrepWork() error {
 
 	w := work.NewWork(workData, s.Target, givenTs, uint32(time.Now().Unix()), false)
 
-	log.Tracef("Stratum prepated work data %v, target %032x",
-		hex.EncodeToString(w.Data[:]), w.Target.Bytes())
+	log.Tracef("Stratum prepated work data %x, target %032x", w.Data[:],
+		w.Target.Bytes())
 	s.PoolWork.Work = w
 
 	return nil
diff --git a/util/util.go b/util/util.go
index 9d389bc..d520aaa 100644
--- a/util/util.go
+++ b/util/util.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 package util
 
@@ -96,15 +96,14 @@ func Uint32EndiannessSwap(v uint32) uint32 {
 
 // FormatHashRate sets the units properly when displaying a hashrate.
 func FormatHashRate(h float64) string {
-	if h > 1000000000 {
-		return fmt.Sprintf("%.3fGH/s", h/1000000000)
-	} else if h > 1000000 {
-		return fmt.Sprintf("%.0fMH/s", h/1000000)
-	} else if h > 1000 {
-		return fmt.Sprintf("%.1fkH/s", h/1000)
-	} else if h == 0 {
-		return "0H/s"
+	const unit = 1000
+	if h < unit {
+		return fmt.Sprintf("%.0f h/s", h)
 	}
-
-	return fmt.Sprintf("%.1f GH/s", h)
+	div, exp := float64(unit), 0
+	for n := h / unit; n >= unit && exp < 6; n /= unit {
+		div *= unit
+		exp++
+	}
+	return fmt.Sprintf("%.2f %ch/s", h/div, "kMGTPEZ"[exp])
 }
diff --git a/work/work.go b/work/work.go
index d8b220d..85e5099 100644
--- a/work/work.go
+++ b/work/work.go
@@ -12,6 +12,7 @@ const (
 	Nonce0Word    = 3
 	Nonce1Word    = 4
 	Nonce2Word    = 5
+	Nonce3Word    = 6
 )
 
 // NewWork is the constructor for Work.

From 8467f2b17c14863cd39c68969a8f5f6163738ca5 Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Mon, 4 Sep 2023 10:07:20 -0300
Subject: [PATCH 104/150] miner: Fix possible shutdown deadlock.

This is only a very quick fix to prevent it deadlocking during shutdown.
---
 device.go |  5 ++++-
 miner.go  | 11 +++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/device.go b/device.go
index ba3adab..0e72d44 100644
--- a/device.go
+++ b/device.go
@@ -359,7 +359,10 @@ func (d *Device) Stop() {
 }
 
 func (d *Device) SetWork(w *work.Work) {
-	d.newWork <- w
+	select {
+	case d.newWork <- w:
+	case <-d.quit:
+	}
 }
 
 func (d *Device) PrintStats() {
diff --git a/miner.go b/miner.go
index 4f02462..7429641 100644
--- a/miner.go
+++ b/miner.go
@@ -82,7 +82,10 @@ func (m *Miner) workSubmitThread() {
 						atomic.AddUint64(&m.invalidShares, 1)
 					}
 
-					m.needsWorkRefresh <- struct{}{}
+					select {
+					case m.needsWorkRefresh <- struct{}{}:
+					case <-m.quit:
+					}
 				}
 			} else {
 				submitted, err := GetPoolWorkSubmit(data, m.pool)
@@ -101,7 +104,11 @@ func (m *Miner) workSubmitThread() {
 						minrLog.Debugf("Submitted work to pool successfully: %v",
 							submitted)
 					}
-					m.needsWorkRefresh <- struct{}{}
+
+					select {
+					case m.needsWorkRefresh <- struct{}{}:
+					case <-m.quit:
+					}
 				}
 			}
 		}

From 4bc1b4876f7473fbb7c8baab56931e90b23a5c10 Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Mon, 4 Sep 2023 09:21:00 -0300
Subject: [PATCH 105/150] cuda: Switch to Blake3.

This switches the CUDA implementation of gominer to use the new Blake3
hashing function.

The CUDA implementation uses the same optimized kernel as the one
developed for the OpenCL implementation, with the necessary changes to
make it compile with the CUDA toolkit.
---
 GNUmakefile           |   18 +-
 README.md             |   11 +-
 blake3.cu             |  182 ++++
 cgo_flags.go          |    4 +-
 compat.h              |   94 --
 cuda_helper.h         |  685 --------------
 cudakernel_static.go  |   17 -
 cudakernel_windows.go |   18 +-
 cudevice.go           |  122 ++-
 decred.cu             |  359 --------
 decred.h              |    4 +-
 miner.h               |  622 -------------
 sph/blake.c           | 1133 -----------------------
 sph/sph_blake.h       |  337 -------
 sph/sph_types.h       | 1976 -----------------------------------------
 15 files changed, 260 insertions(+), 5322 deletions(-)
 create mode 100644 blake3.cu
 delete mode 100644 compat.h
 delete mode 100644 cuda_helper.h
 delete mode 100644 decred.cu
 delete mode 100644 miner.h
 delete mode 100644 sph/blake.c
 delete mode 100644 sph/sph_blake.h
 delete mode 100644 sph/sph_types.h

diff --git a/GNUmakefile b/GNUmakefile
index 1469e55..5dbc8d2 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -17,7 +17,7 @@ endif
 #
 # Windows build assumes that CUDA V7.0 is installed in its default location.
 #
-# Windows gominer requires nvml.dll and decred.dll to reside in the same
+# Windows gominer requires nvml.dll and blake3-decred.dll to reside in the same
 # directory as gominer.exe.
 ifeq ($(ARCH),Msys)
 obj: nvidia
@@ -30,24 +30,24 @@ endif
 	mkdir obj
 
 ifeq ($(ARCH),Msys)
-obj/decred.dll: obj sph/blake.c decred.cu
-	$(NVCC) --shared --optimize=3 --compiler-options=-GS-,-MD -I. -Isph decred.cu sph/blake.c -o obj/decred.dll
+obj/blake3-decred.dll: obj blake3.cu
+	$(NVCC) --shared --optimize=3 --compiler-options=-GS-,-MD -I. blake3.cu -o obj/blake3-decred.dll
 else
-obj/decred.a: obj sph/blake.c decred.cu
-	$(NVCC) --lib --optimize=3 -I. decred.cu sph/blake.c -o obj/decred.a
+obj/blake3.a: obj blake3.cu
+	$(NVCC) --lib --optimize=3 -I. blake3.cu -o obj/blake3.a
 endif
 
 ifeq ($(ARCH),Msys)
-build: obj/decred.dll
+build: obj/blake3-decred.dll
 else
-build: obj/decred.a
+build: obj/blake3.a
 endif
 	go build -tags 'cuda'
 
 ifeq ($(ARCH),Msys)
-install: obj/decred.dll
+install: obj/blake3-decred.dll
 else
-install: obj/decred.a
+install: obj/blake3.a
 endif
 	go install -tags 'cuda'
 
diff --git a/README.md b/README.md
index 66c00a8..afb9aad 100644
--- a/README.md
+++ b/README.md
@@ -78,9 +78,6 @@ $ curl http://localhost:3333/
 
 #### Pre-Requisites
 
-NOTE: The CUDA support has NOT been updated yet for BLAKE3.  Matheus is working
-on adding support, so this section hasn't been modified, but it is out of date.
-
 You will either need to install CUDA for NVIDIA graphics cards or OpenCL
 library/headers that support your device such as: AMDGPU-PRO (for newer AMD
 cards), Beignet (for Intel Graphics), or Catalyst (for older AMD cards).
@@ -89,12 +86,11 @@ For example, on Ubuntu 23.04 you can install the necessary OpenCL packages (for
 Intel Graphics) and CUDA libraries with:
 
 ```
-sudo apt-get install beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit
+sudo apt-get install nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
 gominer has been built successfully on Ubuntu 23.04 with go1.21.0,
-g++ 5.4.0, and beignet-dev 1.1.1-2 although other combinations should work as
-well.
+g++ 5.4.0 although other combinations should work as well.
 
 #### Instructions
 
@@ -135,6 +131,9 @@ go build -tags opencladl
 
 ##### CUDA
 
+**NOTE**: The CUDA version of the Blake3 gominer is not yet compatible to
+windows.
+
 ###### Pre-Requisites
 
 - Download Microsoft Visual Studio 2013 from [https://www.microsoft.com/en-us/download/details.aspx?id=44914](https://www.microsoft.com/en-us/download/details.aspx?id=44914)
diff --git a/blake3.cu b/blake3.cu
new file mode 100644
index 0000000..8f928a0
--- /dev/null
+++ b/blake3.cu
@@ -0,0 +1,182 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Decred BLAKE3 midstate-based CUDA kernel
+
+// Written and optimized by Dave Collins Sep 2023.
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif /* _WIN32 */
+
+#define MAX_OUTPUT_RESULTS 32
+
+// Written and optimized by Dave Collins Sep 2023.
+#define ROTR(v, n) __funnelshift_rc((v), (v), n)
+
+__global__
+void search(
+    uint *output,
+    // Midstate.
+    const uint cv0,
+    const uint cv1,
+    const uint cv2,
+    const uint cv3,
+    const uint cv4,
+    const uint cv5,
+    const uint cv6,
+    const uint cv7,
+
+    // Final 52 bytes of data.
+    const uint m0,
+    const uint m1,
+    const uint m2,
+    // const uint m3 : nonce
+    const uint m4,
+    const uint m5,
+    const uint m6,
+    const uint m7,
+    const uint m8,
+    const uint m9,
+    const uint m10,
+    const uint m11,
+    const uint m12)
+{
+    // Nonce.
+    const uint m3 = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // BLAKE3 init vectors.
+    const uint iv0 = 0x6a09e667ul;
+    const uint iv1 = 0xbb67ae85ul;
+    const uint iv2 = 0x3c6ef372ul;
+    const uint iv3 = 0xa54ff53aul;
+    // const uint iv4 = 0x510e527ful;
+    // const uint iv5 = 0x9b05688cul;
+    // const uint iv6 = 0x1f83d9abul;
+    // const uint iv7 = 0x5be0cd19ul;
+
+    // Internal compression func state.
+    uint v0, v1, v2, v3, v4, v5, v6, v7;
+    uint v8, v9, v10, v11, v12, v13, v14, v15;
+
+    // Do the initialization and first round together.
+    // Round 1.
+    v0 = cv0 + cv4 + m0; v12 = ROTR(v0, 16); v8 = iv0 + v12; v4 = ROTR(cv4 ^ v8, 12); v0 += v4 + m1; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = cv1 + cv5 + m2; v13 = ROTR(v1, 16); v9 = iv1 + v13; v5 = ROTR(cv5 ^ v9, 12); v1 += v5 + m3; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = cv2 + cv6 + m4; v14 = ROTR(52 ^ v2, 16); v10 = iv2 + v14; v6 = ROTR(cv6 ^ v10, 12); v2 += v6 + m5; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = cv3 + cv7 + m6; v15 = ROTR(10 ^ v3, 16); v11 = iv3 + v15; v7 = ROTR(cv7 ^ v11, 12); v3 += v7 + m7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m8; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m9; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m10; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m11; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m12; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 2 with message word permutation.
+    v0 = v0 + v4 + m2; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4 + m6; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m3; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m10; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m7; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m0; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m4; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m1; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m11; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m12; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m5; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m9; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m8; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 3 with message word permutation.
+    v0 = v0 + v4 + m3; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4 + m4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m10; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m12; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m2; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m6; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m5; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m9; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m0; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m11; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m8; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m1; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 4 with message word permutation.
+    v0 = v0 + v4 + m10; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4 + m7; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m12; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m9; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m3; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m4; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m0; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m11; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m2; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m5; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m8; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m1; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m6; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 5 with message word permutation.
+    v0 = v0 + v4 + m12; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m9; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m11; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m10; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m8; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5 + m7; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m2; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m5; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m3; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m0; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m1; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m6; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m4; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 6 with message word permutation.
+    v0 = v0 + v4 + m9; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m11; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m5; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m8; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m12; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m1; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m3; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m0; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m10; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m2; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m6; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m4; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4 + m7; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Round 7 with message word permutation.
+    v0 = v0 + v4 + m11; v12 = ROTR(v12 ^ v0, 16); v8 = v8 + v12; v4 = ROTR(v4 ^ v8, 12); v0 += v4; v12 = ROTR(v12 ^ v0, 8); v8 += v12; v4 = ROTR(v4 ^ v8, 7);
+    v1 = v1 + v5 + m5; v13 = ROTR(v13 ^ v1, 16); v9 = v9 + v13; v5 = ROTR(v5 ^ v9, 12); v1 += v5 + m0; v13 = ROTR(v13 ^ v1, 8); v9 += v13; v5 = ROTR(v5 ^ v9, 7);
+    v2 = v2 + v6 + m1; v14 = ROTR(v14 ^ v2, 16); v10 += v14; v6 = ROTR(v6 ^ v10, 12); v2 += v6 + m9; v14 = ROTR(v14 ^ v2, 8); v10 += v14; v6 = ROTR(v6 ^ v10, 7);
+    v3 = v3 + v7 + m8; v15 = ROTR(v15 ^ v3, 16); v11 += v15; v7 = ROTR(v7 ^ v11, 12); v3 += v7 + m6; v15 = ROTR(v15 ^ v3, 8); v11 += v15; v7 = ROTR(v7 ^ v11, 7);
+    v0 = v0 + v5; v15 = ROTR(v15 ^ v0, 16); v10 += v15; v5 = ROTR(v5 ^ v10, 12); v0 += v5 + m10; v15 = ROTR(v15 ^ v0, 8); v10 += v15; v5 = ROTR(v5 ^ v10, 7);
+    v1 = v1 + v6 + m2; v12 = ROTR(v12 ^ v1, 16); v11 += v12; v6 = ROTR(v6 ^ v11, 12); v1 += v6 + m12; v12 = ROTR(v12 ^ v1, 8); v11 += v12; v6 = ROTR(v6 ^ v11, 7);
+    v2 = v2 + v7 + m3; v13 = ROTR(v13 ^ v2, 16); v8 += v13; v7 = ROTR(v7 ^ v8, 12); v2 += v7 + m4; v13 = ROTR(v13 ^ v2, 8); v8 += v13; v7 = ROTR(v7 ^ v8, 7);
+    v3 = v3 + v4 + m7; v14 = ROTR(v14 ^ v3, 16); v9 += v14; v4 = ROTR(v4 ^ v9, 12); v3 += v4; v14 = ROTR(v14 ^ v3, 8); v9 += v14; v4 = ROTR(v4 ^ v9, 7);
+
+    // Finally the truncated 256-bit output is defined as:
+    //
+    // h'0 = v0^v8
+    // h'1 = v1^v9
+    // h'2 = v2^v10
+    // h'3 = v3^v11
+    // h'4 = v4^v12
+    // h'5 = v5^v13
+    // h'6 = v6^v14
+    // h'7 = v7^v15
+    //
+    // Just check if the last word (32-bits) is zeroed and return back to the
+    // miner to notify it that a potential solution was found so it can check it
+    // against the target difficulty.
+
+    // Debug code to print result of hashing function.
+    // if (!((v7 ^ v15) & 0xFFFF0000)) {
+    //     printf("hash on gpu %x %x %x %x %x %x %x %x\n",
+    //         v0 ^ v8, v1 ^ v9, v2 ^ v10, v3 ^ v11,
+    //         v4 ^ v12, v5 ^ v13, v6 ^ v14, v7 ^ v15);
+    //     printf("nonce for hash on gpu %x\n", m3);
+    // }
+
+    if (v7 ^ v15)
+        return;
+
+    // Update nonce.
+    uint pos = atomicInc(&output[0], 0xffffffff)+1;
+    if (pos > MAX_OUTPUT_RESULTS) return; // Bounds check output buffer.
+    output[pos] = m3;
+}
+
+
+extern "C" {
+__host__ DLLEXPORT void
+decred_blake3_hash(const uint32_t dimgrid, const uint32_t threads, uint32_t *cv, uint32_t *m, uint32_t *out)
+{
+	search<<<dimgrid, threads>>>(
+		out,
+		cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7],
+		m[0], m[1], m[2],
+		// m3,
+		m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12]
+
+	);
+}
+}
diff --git a/cgo_flags.go b/cgo_flags.go
index d64e25e..39dfda4 100644
--- a/cgo_flags.go
+++ b/cgo_flags.go
@@ -6,7 +6,7 @@
 package main
 
 /*
-#cgo !windows LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/decred.a
-#cgo windows LDFLAGS: -Lobj -ldecred -Lnvidia/CUDA/v7.0/lib/x64 -lcuda -lcudart -Lnvidia/NVSMI -lnvml
+#cgo !windows LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/blake3.a
+#cgo windows LDFLAGS: -Lobj -lblake3-decred -Lnvidia/CUDA/v7.0/lib/x64 -lcuda -lcudart -Lnvidia/NVSMI -lnvml
 */
 import "C"
diff --git a/compat.h b/compat.h
deleted file mode 100644
index a98dab9..0000000
--- a/compat.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef __COMPAT_H__
-#define __COMPAT_H__
-
-#ifdef WIN32
-
-#include <windows.h>
-#include <time.h>
-
-#define localtime_r(src, dst) localtime_s(dst, src)
-
-static __inline void sleep(int secs)
-{
-	Sleep(secs * 1000);
-}
-
-enum {
-	PRIO_PROCESS = 0,
-};
-
-extern int opt_priority;
-
-static __inline int setpriority(int which, int who, int prio)
-{
-	switch (opt_priority) {
-		case 5:
-			prio = THREAD_PRIORITY_TIME_CRITICAL;
-			break;
-		case 4:
-			prio = THREAD_PRIORITY_HIGHEST;
-			break;
-		case 3:
-			prio = THREAD_PRIORITY_ABOVE_NORMAL;
-			break;
-		case 2:
-			prio = THREAD_PRIORITY_NORMAL;
-			break;
-		case 1:
-			prio = THREAD_PRIORITY_BELOW_NORMAL;
-			break;
-		case 0:
-		default:
-			prio = THREAD_PRIORITY_IDLE;
-	}
-	return -!SetThreadPriority(GetCurrentThread(), prio);
-}
-
-#ifdef _MSC_VER
-#define snprintf(...) _snprintf(__VA_ARGS__)
-#define strdup(...) _strdup(__VA_ARGS__)
-#define strncasecmp(x,y,z) _strnicmp(x,y,z)
-#define strcasecmp(x,y) _stricmp(x,y)
-typedef int ssize_t;
-
-__inline int msver(void) {
-	switch (_MSC_VER) {
-	case 1500: return 2008;
-	case 1600: return 2010;
-	case 1700: return 2012;
-	case 1800: return 2013;
-	case 1900: return 2015;
-	default: return (_MSC_VER/100);
-	}
-}
-
-#include <stdlib.h>
-static __inline char * dirname(char *file) {
-	char buffer[_MAX_PATH] = { 0 };
-	char drive[_MAX_DRIVE];
-	char dir[_MAX_DIR];
-	char fname[_MAX_FNAME];
-	char ext[_MAX_EXT];
-	_splitpath_s(file, drive, _MAX_DRIVE, dir, _MAX_DIR, fname, _MAX_FNAME, ext, _MAX_EXT);
-	sprintf(buffer, "%s%s", drive, dir);
-	return strdup(buffer);
-}
-#endif
-
-#endif /* WIN32 */
-
-#ifdef _MSC_VER
-# define __func__ __FUNCTION__
-# define __thread __declspec(thread)
-# define _ALIGN(x) __declspec(align(x))
-#else
-# define _ALIGN(x) __attribute__ ((aligned(x)))
-/* dirname() for linux/mingw */
-#include <libgen.h>
-#endif
-
-#ifndef WIN32
-#define MAX_PATH PATH_MAX
-#endif
-
-#endif /* __COMPAT_H__ */
diff --git a/cuda_helper.h b/cuda_helper.h
deleted file mode 100644
index 1358892..0000000
--- a/cuda_helper.h
+++ /dev/null
@@ -1,685 +0,0 @@
-#ifndef CUDA_HELPER_H
-#define CUDA_HELPER_H
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#ifdef __INTELLISENSE__
-/* reduce vstudio warnings (__byteperm, blockIdx...) */
-#include <device_functions.h>
-#include <device_launch_parameters.h>
-#define __launch_bounds__(max_tpb, min_blocks)
-#endif
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#ifndef UINT32_MAX
-/* slackware need that */
-#define UINT32_MAX UINT_MAX
-#endif
-
-#ifndef MAX_GPUS
-#define MAX_GPUS 16
-#endif
-
-extern "C" short device_map[MAX_GPUS];
-extern "C"  long device_sm[MAX_GPUS];
-
-extern int cuda_arch[MAX_GPUS];
-
-// common functions
-extern int cuda_get_arch(int thr_id);
-extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
-extern void cuda_check_cpu_free(int thr_id);
-extern void cuda_check_cpu_setTarget(const void *ptarget);
-extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
-extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
-extern __device__ __device_builtin__ void __syncthreads(void);
-extern __device__ __device_builtin__ void __threadfence(void);
-
-#ifndef __CUDA_ARCH__
-// define blockDim and threadIdx for host
-extern const dim3 blockDim;
-extern const uint3 threadIdx;
-#endif
-
-#ifndef SPH_C32
-#define SPH_C32(x) (x)
-// #define SPH_C32(x) ((uint32_t)(x ## U))
-#endif
-
-#ifndef SPH_C64
-#define SPH_C64(x) (x)
-// #define SPH_C64(x) ((uint64_t)(x ## ULL))
-#endif
-
-#ifndef SPH_T32
-#define SPH_T32(x) (x)
-// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#endif
-
-#ifndef SPH_T64
-#define SPH_T64(x) (x)
-// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#endif
-
-#if __CUDA_ARCH__ < 320
-// Host and Compute 3.0
-#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
-#define __ldg(x) (*(x))
-#else
-// Compute 3.2+
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
-#endif
-
-__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-	return __double_as_longlong(__hiloint2double(HI, LO));
-#else
-	return (uint64_t)LO | (((uint64_t)HI) << 32);
-#endif
-}
-
-// das Hi Word in einem 64 Bit Typen ersetzen
-__device__ __forceinline__ uint64_t REPLACE_HIDWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U);
-}
-
-// das Lo Word in einem 64 Bit Typen ersetzen
-__device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
-}
-
-// Endian Drehung f�r 32 Bit Typen
-#ifdef __CUDA_ARCH__
-__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
-{
-	/* device */
-	return __byte_perm(x, x, 0x0123);
-}
-#else
-	/* host */
-	#define cuda_swab32(x) \
-	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
-		(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
-#endif
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-__device__ __forceinline__ uint32_t _LODWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-
-// das Hi Word aus einem 64 Bit Typen extrahieren
-__device__ __forceinline__ uint32_t _HIDWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-#ifdef __CUDA_ARCH__
-__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
-{
-	// Input:       77665544 33221100
-	// Output:      00112233 44556677
-	uint64_t result;
-	//result = __byte_perm((uint32_t) x, 0, 0x0123);
-	//return (result << 32) + __byte_perm(_HIDWORD(x), 0, 0x0123);
-	asm("{ .reg .b32 x, y; // swab64\n\t"
-		"mov.b64 {x,y}, %1;\n\t"
-		"prmt.b32 x, x, 0, 0x0123;\n\t"
-		"prmt.b32 y, y, 0, 0x0123;\n\t"
-		"mov.b64 %0, {y,x};\n\t"
-	"}\n" : "=l"(result): "l"(x));
-	return result;
-}
-#else
-	/* host */
-	#define cuda_swab64(x) \
-		((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
-			(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
-			(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
-			(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
-			(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
-			(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
-			(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
-			(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
-#endif
-
-// swap two uint32_t without extra registers
-__device__ __host__ __forceinline__ void xchg(uint32_t &x, uint32_t &y) {
-	x ^= y; y = x ^ y; x ^= y;
-}
-// for other types...
-#define XCHG(x, y) { x ^= y; y = x ^ y; x ^= y; }
-
-/*********************************************************************/
-// Macros to catch CUDA errors in CUDA runtime calls
-
-#define CUDA_SAFE_CALL(call)                                          \
-do {                                                                  \
-	cudaError_t err = call;                                           \
-	if (cudaSuccess != err) {                                         \
-		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
-		         __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
-		exit(EXIT_FAILURE);                                           \
-	}                                                                 \
-} while (0)
-
-#define CUDA_CALL_OR_RET(call) do {                                   \
-	cudaError_t err = call;                                           \
-	if (cudaSuccess != err) {                                         \
-		cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
-		return;                                                       \
-	}                                                                 \
-} while (0)
-
-#define CUDA_CALL_OR_RET_X(call, ret) do {                            \
-	cudaError_t err = call;                                           \
-	if (cudaSuccess != err) {                                         \
-		cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
-		return ret;                                                   \
-	}                                                                 \
-} while (0)
-
-/*********************************************************************/
-#if !defined(__CUDA_ARCH__) || defined(_WIN64)
-#define USE_XOR_ASM_OPTS 0
-#else
-#define USE_XOR_ASM_OPTS 1
-#endif
-
-#if USE_XOR_ASM_OPTS
-// device asm for whirpool
-__device__ __forceinline__
-uint64_t xor1(uint64_t a, uint64_t b)
-{
-	uint64_t result;
-	asm("xor.b64 %0, %1, %2; // xor1" : "=l"(result) : "l"(a), "l"(b));
-	return result;
-}
-#else
-#define xor1(a,b) (a ^ b)
-#endif
-
-#if USE_XOR_ASM_OPTS
-// device asm for whirpool
-__device__ __forceinline__
-uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
-{
-	uint64_t result;
-	asm("xor.b64 %0, %2, %3; // xor3\n\t"
-	    "xor.b64 %0, %0, %1;\n\t"
-		/* output : input registers */
-		: "=l"(result) : "l"(a), "l"(b), "l"(c));
-	return result;
-}
-#else
-#define xor3(a,b,c) (a ^ b ^ c)
-#endif
-
-#if USE_XOR_ASM_OPTS
-// device asm for whirpool
-__device__ __forceinline__
-uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
-{
-	uint64_t result;
-	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h));
-	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
-	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
-	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
-	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
-	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
-	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
-	return result;
-}
-#else
-#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h))
-#endif
-
-// device asm for x17
-__device__ __forceinline__
-uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
-{
-#ifdef __CUDA_ARCH__
-	uint64_t result;
-	asm("{ // xandx \n\t"
-		".reg .u64 n;\n\t"
-		"xor.b64 %0, %2, %3;\n\t"
-		"and.b64 n, %0, %1;\n\t"
-		"xor.b64 %0, n, %3;\n\t"
-	"}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c));
-	return result;
-#else
-	return ((b^c) & a) ^ c;
-#endif
-}
-
-// device asm for x17
-__device__ __forceinline__
-uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
-{
-#ifdef __CUDA_ARCH__
-	uint64_t result;
-	asm("{ // andor\n\t"
-		".reg .u64 m,n;\n\t"
-		"and.b64 m,  %1, %2;\n\t"
-		" or.b64 n,  %1, %2;\n\t"
-		"and.b64 %0, n,  %3;\n\t"
-		" or.b64 %0, %0, m;\n\t"
-	"}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c));
-	return result;
-#else
-	return ((a | b) & c) | (a & b);
-#endif
-}
-
-// device asm for x17
-__device__ __forceinline__
-uint64_t shr_t64(uint64_t x, uint32_t n)
-{
-#ifdef __CUDA_ARCH__
-	uint64_t result;
-	asm("shr.b64 %0,%1,%2;\n\t"
-	: "=l"(result) : "l"(x), "r"(n));
-	return result;
-#else
-	return x >> n;
-#endif
-}
-
-__device__ __forceinline__
-uint64_t shl_t64(uint64_t x, uint32_t n)
-{
-#ifdef __CUDA_ARCH__
-	uint64_t result;
-	asm("shl.b64 %0,%1,%2;\n\t"
-	: "=l"(result) : "l"(x), "r"(n));
-	return result;
-#else
-	return x << n;
-#endif
-}
-
-__device__ __forceinline__
-uint32_t shr_t32(uint32_t x,uint32_t n) {
-#ifdef __CUDA_ARCH__
-	uint32_t result;
-	asm("shr.b32 %0,%1,%2;"	: "=r"(result) : "r"(x), "r"(n));
-	return result;
-#else
-	return x >> n;
-#endif
-}
-
-__device__ __forceinline__
-uint32_t shl_t32(uint32_t x,uint32_t n) {
-#ifdef __CUDA_ARCH__
-	uint32_t result;
-	asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
-	return result;
-#else
-	return x << n;
-#endif
-}
-
-#ifndef USE_ROT_ASM_OPT
-#define USE_ROT_ASM_OPT 1
-#endif
-
-// 64-bit ROTATE RIGHT
-#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
-/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
-__device__ __forceinline__
-uint64_t ROTR64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset < 32) {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
-__device__ __forceinline__
-uint64_t ROTR64(const uint64_t x, const int offset)
-{
-	uint64_t result;
-	asm("{ // ROTR64 \n\t"
-		".reg .b64 lhs;\n\t"
-		".reg .u32 roff;\n\t"
-		"shr.b64 lhs, %1, %2;\n\t"
-		"sub.u32 roff, 64, %2;\n\t"
-		"shl.b64 %0, %1, roff;\n\t"
-		"add.u64 %0, %0, lhs;\n\t"
-	"}\n" : "=l"(result) : "l"(x), "r"(offset));
-	return result;
-}
-#else
-/* host */
-#define ROTR64(x, n)  (((x) >> (n)) | ((x) << (64 - (n))))
-#endif
-
-// 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
-__device__ __forceinline__
-uint64_t ROTL64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset >= 32) {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
-__device__ __forceinline__
-uint64_t ROTL64(const uint64_t x, const int offset)
-{
-	uint64_t result;
-	asm("{ // ROTL64 \n\t"
-		".reg .b64 lhs;\n\t"
-		".reg .u32 roff;\n\t"
-		"shl.b64 lhs, %1, %2;\n\t"
-		"sub.u32 roff, 64, %2;\n\t"
-		"shr.b64 %0, %1, roff;\n\t"
-		"add.u64 %0, lhs, %0;\n\t"
-	"}\n" : "=l"(result) : "l"(x), "r"(offset));
-	return result;
-}
-#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
-__device__
-uint64_t ROTL64(const uint64_t x, const int offset)
-{
-	uint64_t res;
-	asm("{ // ROTL64 \n\t"
-		".reg .u32 tl,th,vl,vh;\n\t"
-		".reg .pred p;\n\t"
-		"mov.b64 {tl,th}, %1;\n\t"
-		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
-		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
-		"setp.lt.u32 p, %2, 32;\n\t"
-		"@!p mov.b64 %0, {vl,vh};\n\t"
-		"@p  mov.b64 %0, {vh,vl};\n\t"
-	"}\n" : "=l"(res) : "l"(x) , "r"(offset)
-	);
-	return res;
-}
-#else
-/* host */
-#define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
-__device__ __forceinline__
-uint64_t SWAPDWORDS(uint64_t value)
-{
-#if __CUDA_ARCH__ >= 320
-	uint2 temp;
-	asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
-	asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
-	return value;
-#else
-	return ROTL64(value, 32);
-#endif
-}
-
-/* lyra2/bmw - uint2 vector's operators */
-
-__device__ __forceinline__
-void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
-#ifdef __CUDA_ARCH__
-	asm("mov.b64 {%0,%1},%2; \n\t"
-		: "=r"(lo), "=r"(hi) : "l"(x));
-#else
-	lo = (uint32_t)(x);
-	hi = (uint32_t)(x >> 32);
-#endif
-}
-
-static __host__ __device__ __forceinline__ uint2 vectorize(uint64_t v) {
-	uint2 result;
-#ifdef __CUDA_ARCH__
-	asm("mov.b64 {%0,%1},%2; \n\t"
-		: "=r"(result.x), "=r"(result.y) : "l"(v));
-#else
-	result.x = (uint32_t)(v);
-	result.y = (uint32_t)(v >> 32);
-#endif
-	return result;
-}
-
-static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) {
-#ifdef __CUDA_ARCH__
-	return MAKE_ULONGLONG(v.x, v.y);
-#else
-	return (((uint64_t)v.y) << 32) + v.x;
-#endif
-}
-
-/**
- * uint2 direct ops by c++ operator definitions
- */
-static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
-static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
-static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
-static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
-static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
-
-static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
-#ifdef __CUDA_ARCH__
-	uint2 result;
-	asm("{ // uint2 a+b \n\t"
-		"add.cc.u32 %0, %2, %4; \n\t"
-		"addc.u32   %1, %3, %5; \n\t"
-	"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
-	return result;
-#else
-	return vectorize(devectorize(a) + devectorize(b));
-#endif
-}
-static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
-
-
-static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) {
-#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
-	uint2 result;
-	asm("{ // uint2 a-b \n\t"
-		"sub.cc.u32 %0, %2, %4; \n\t"
-		"subc.u32   %1, %3, %5; \n\t"
-	"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
-	return result;
-#else
-	return vectorize(devectorize(a) - devectorize(b));
-#endif
-}
-static __device__ __forceinline__ void operator-= (uint2 &a, uint2 b) { a = a - b; }
-
-/**
- * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b))
- * (what does uint64 "*" operator)
- */
-static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
-{
-#ifdef __CUDA_ARCH__
-	uint2 result;
-	asm("{ // uint2 a*b \n\t"
-		"mul.lo.u32       %0, %2, %4;  \n\t"
-		"mul.hi.u32       %1, %2, %4;  \n\t"
-		"mad.lo.cc.u32    %1, %3, %4, %1; \n\t"
-		"madc.lo.u32      %1, %3, %5, %1; \n\t"
-	"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
-	return result;
-#else
-	// incorrect but unused host equiv
-	return make_uint2(a.x * b.x, a.y * b.y);
-#endif
-}
-
-// uint2 ROR/ROL methods
-__device__ __forceinline__
-uint2 ROR2(const uint2 a, const int offset)
-{
-	uint2 result;
-#if __CUDA_ARCH__ > 300
-	if (offset < 32) {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
-	} else /* if (offset < 64) */ {
-		/* offset SHOULD BE < 64 ! */
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
-	}
-#else
-	if (!offset)
-		result = a;
-	else if (offset < 32) {
-		result.y = ((a.y >> offset) | (a.x << (32 - offset)));
-		result.x = ((a.x >> offset) | (a.y << (32 - offset)));
-	} else if (offset == 32) {
-		result.y = a.x;
-		result.x = a.y;
-	} else {
-		result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
-		result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
-	}
-#endif
-	return result;
-}
-
-__device__ __forceinline__
-uint2 ROL2(const uint2 a, const int offset)
-{
-	uint2 result;
-#if __CUDA_ARCH__ > 300
-	if (offset >= 32) {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
-	}
-	else {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
-	}
-#else
-	if (!offset)
-		result = a;
-	else
-		result = ROR2(a, 64 - offset);
-#endif
-	return result;
-}
-
-__device__ __forceinline__
-uint2 SWAPUINT2(uint2 value)
-{
-	return make_uint2(value.y, value.x);
-}
-
-/* Byte aligned Rotations (lyra2) */
-#ifdef __CUDA_ARCH__
-__device__ __inline__ uint2 ROL8(const uint2 a)
-{
-	uint2 result;
-	result.x = __byte_perm(a.y, a.x, 0x6543);
-	result.y = __byte_perm(a.y, a.x, 0x2107);
-	return result;
-}
-
-__device__ __inline__ uint2 ROR16(const uint2 a)
-{
-	uint2 result;
-	result.x = __byte_perm(a.y, a.x, 0x1076);
-	result.y = __byte_perm(a.y, a.x, 0x5432);
-	return result;
-}
-
-__device__ __inline__ uint2 ROR24(const uint2 a)
-{
-	uint2 result;
-	result.x = __byte_perm(a.y, a.x, 0x2107);
-	result.y = __byte_perm(a.y, a.x, 0x6543);
-	return result;
-}
-#else
-#define ROL8(u)  ROL2(u, 8)
-#define ROR16(u) ROR2(u,16)
-#define ROR24(u) ROR2(u,24)
-#endif
-
-/* uint2 for bmw512 - to double check later */
-
-__device__ __forceinline__
-static uint2 SHL2(uint2 a, int offset)
-{
-#if __CUDA_ARCH__ > 300
-	uint2 result;
-	if (offset < 32)  {
-		asm("{ // SHL2 (l) \n\t"
-			"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
-			"shl.b32         %0, %2, %4;     \n\t"
-		"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
-	} else {
-		asm("{ // SHL2 (h) \n\t"
-			"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
-			"shl.b32         %0, %2, %4;     \n\t"
-		"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
-	}
-	return result;
-#else
-	if (offset <= 32) {
-		a.y = (a.y << offset) | (a.x >> (32 - offset));
-		a.x = (a.x << offset);
-	} else {
-		a.y = (a.x << (offset-32));
-		a.x = 0;
-	}
-	return a;
-#endif
-}
-
-__device__ __forceinline__
-static uint2 SHR2(uint2 a, int offset)
-{
-#if __CUDA_ARCH__ > 300
-	uint2 result;
-	if (offset<32) {
-		asm("{\n\t"
-			"shf.r.clamp.b32 %0,%2,%3,%4; \n\t"
-			"shr.b32 %1,%3,%4; \n\t"
-			"}\n\t"
-			: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
-	} else {
-		asm("{\n\t"
-			"shf.l.clamp.b32 %0,%2,%3,%4; \n\t"
-			"shl.b32 %1,%3,%4; \n\t"
-			"}\n\t"
-			: "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
-	}
-	return result;
-#else
-	if (offset <= 32) {
-		a.x = (a.x >> offset) | (a.y << (32 - offset));
-		a.y = (a.y >> offset);
-	} else {
-		a.x = (a.y >> (offset - 32));
-		a.y = 0;
-	}
-	return a;
-#endif
-}
-
-#endif // #ifndef CUDA_HELPER_H
diff --git a/cudakernel_static.go b/cudakernel_static.go
index 1ee560e..30360db 100644
--- a/cudakernel_static.go
+++ b/cudakernel_static.go
@@ -9,20 +9,3 @@ package main
 #include "decred.h"
 */
 import "C"
-import (
-	"unsafe"
-
-	"github.com/barnex/cuda5/cu"
-)
-
-func cudaPrecomputeTable(input *[192]byte) {
-	if input == nil {
-		panic("input is nil")
-	}
-	C.decred_cpu_setBlock_52((*C.uint32_t)(unsafe.Pointer(input)))
-}
-
-func cudaInvokeKernel(gridx, blockx, threads uint32, startNonce uint32, nonceResults cu.DevicePtr, targetHigh uint32) {
-	C.decred_hash_nonce(C.uint32_t(gridx), C.uint32_t(blockx), C.uint32_t(threads),
-		C.uint32_t(startNonce), (*C.uint32_t)(unsafe.Pointer(nonceResults)), C.uint32_t(targetHigh))
-}
diff --git a/cudakernel_windows.go b/cudakernel_windows.go
index 6ece5f3..3e3b028 100644
--- a/cudakernel_windows.go
+++ b/cudakernel_windows.go
@@ -6,23 +6,9 @@ package main
 
 import (
 	"syscall"
-	"unsafe"
-
-	"github.com/barnex/cuda5/cu"
 )
 
 var (
-	//kernelDll           = syscall.MustLoadDLL("decred.dll")
-	kernelDll               = syscall.MustLoadDLL("decred.dll")
-	precomputeTableProcAddr = kernelDll.MustFindProc("decred_cpu_setBlock_52").Addr()
-	kernelProcAddr          = kernelDll.MustFindProc("decred_hash_nonce").Addr()
+	kernelDll      = syscall.MustLoadDLL("blake3-decred.dll")
+	kernelProcAddr = kernelDll.MustFindProc("decred_blake3_hash").Addr()
 )
-
-func cudaPrecomputeTable(input *[192]byte) {
-	syscall.Syscall(precomputeTableProcAddr, 1, uintptr(unsafe.Pointer(input)), 0, 0)
-}
-
-func cudaInvokeKernel(gridx, blockx, threads uint32, startNonce uint32, nonceResults cu.DevicePtr, targetHigh uint32) {
-	syscall.Syscall6(kernelProcAddr, 6, uintptr(gridx), uintptr(blockx), uintptr(threads),
-		uintptr(startNonce), uintptr(nonceResults), uintptr(targetHigh))
-}
diff --git a/cudevice.go b/cudevice.go
index fe015c9..b6438f8 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -11,7 +11,6 @@ package main
 import "C"
 
 import (
-	"encoding/binary"
 	"fmt"
 	"reflect"
 	"runtime"
@@ -28,9 +27,15 @@ import (
 )
 
 const (
-	// From ccminer
-	threadsPerBlock = 640
-	blockx          = threadsPerBlock
+	// threadsPerBlock is the number of CUDA threads per processing block.
+	threadsPerBlock = 1024
+
+	// dimGrid is the number of CUDA blocks to issue.
+	dimGrid = 65504
+
+	// maxOutputResults is the max number of individual output results. MUST
+	// match what is defined in decred.cu.
+	maxOutputResults = 32
 )
 
 // Return the GPU library in use.
@@ -38,11 +43,6 @@ func gpuLib() string {
 	return "CUDA"
 }
 
-const (
-	localWorksize      = 64
-	cuOutputBufferSize = 64
-)
-
 type Device struct {
 	// The following variables must only be used atomically.
 	fanPercent  uint32
@@ -62,11 +62,7 @@ type Device struct {
 	tempTarget               uint32
 
 	// Items for CUDA device
-	cuDeviceID     cu.Device
-	cuInSize       int64
-	cuOutputBuffer []float64
-
-	workSize uint32
+	cuDeviceID cu.Device
 
 	// extraNonce is the device extraNonce, where the first
 	// byte is the device ID (supporting up to 255 devices)
@@ -92,16 +88,11 @@ type Device struct {
 	quit chan struct{}
 }
 
-func decredCPUSetBlock52(input *[192]byte) {
-	if input == nil {
-		panic("input is nil")
-	}
-	C.decred_cpu_setBlock_52((*C.uint32_t)(unsafe.Pointer(input)))
-}
-
-func decredHashNonce(gridx, blockx, threads uint32, startNonce uint32, nonceResults cu.DevicePtr, targetHigh uint32) {
-	C.decred_hash_nonce(C.uint32_t(gridx), C.uint32_t(blockx), C.uint32_t(threads),
-		C.uint32_t(startNonce), (*C.uint32_t)(unsafe.Pointer(nonceResults)), C.uint32_t(targetHigh))
+// decredBlake3Hash forwards its arguments to the C entry point
+// decred_blake3_hash, converting each one to the matching C type.
+func decredBlake3Hash(dimgrid, threads uint32, midstate, lastblock unsafe.Pointer, out cu.DevicePtr) {
+	C.decred_blake3_hash(C.uint(dimgrid), C.uint(threads), (*C.uint)(midstate),
+		(*C.uint)(lastblock), (*C.uint)(unsafe.Pointer(out)))
 }
 
 func deviceStats(index int) (uint32, uint32) {
@@ -198,6 +189,11 @@ func ListDevices() {
 func NewCuDevice(index int, order int, deviceID cu.Device,
 	workDone chan []byte) (*Device, error) {
 
+	devProps := cu.DeviceGetProperties(deviceID)
+	minrLog.Infof("CUDA device %.2x props: MaxThreadsPerBlock: %d, MaxThreadsDim: %v, "+
+		"MaxGridSize: %v, RegsPerBlock: %d", deviceID, devProps.MaxThreadsPerBlock,
+		devProps.MaxThreadsDim, devProps.MaxGridSize, devProps.RegsPerBlock)
+
 	d := &Device{
 		index:       index,
 		cuDeviceID:  deviceID,
@@ -213,8 +209,6 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 		tempTarget:  0,
 	}
 
-	d.cuInSize = 21
-
 	if !deviceLibraryInitialized {
 		err := nvml.Init()
 		if err != nil {
@@ -292,20 +286,44 @@ func (d *Device) runDevice() error {
 	// at compile time.
 
 	minrLog.Infof("Started GPU #%d: %s", d.index, d.deviceName)
-	nonceResultsH := cu.MallocHost(d.cuInSize * 4)
-	nonceResultsD := cu.Malloc(d.cuInSize * 4)
+
+	const WORDSZ = 4 // Everything is sent as uint32.
+
+	// Setup input buffers; slice headers are sized in uint32 words.
+	midstateSz := int64(len(d.midstate) * WORDSZ)
+	midstateH := cu.MallocHost(midstateSz)
+	defer cu.MemFreeHost(midstateH)
+	midstateHSliceHeader := reflect.SliceHeader{
+		Data: uintptr(midstateH),
+		Len:  len(d.midstate), // Element count: midstateSz is in bytes.
+		Cap:  len(d.midstate), // The host buffer holds exactly this many.
+	}
+	midstateHSlice := *(*[]uint32)(unsafe.Pointer(&midstateHSliceHeader))
+
+	lastBlockSz := int64(len(d.lastBlock) * WORDSZ)
+	lastBlockH := cu.MallocHost(lastBlockSz)
+	defer cu.MemFreeHost(lastBlockH)
+	lastBlockHSliceHeader := reflect.SliceHeader{
+		Data: uintptr(lastBlockH),
+		Len:  len(d.lastBlock), // Element count: lastBlockSz is in bytes.
+		Cap:  len(d.lastBlock), // The host buffer holds exactly this many.
+	}
+	lastBlockHSlice := *(*[]uint32)(unsafe.Pointer(&lastBlockHSliceHeader))
+
+	// Setup output buffer.
+	nonceResultsH := cu.MallocHost(maxOutputResults * WORDSZ)
+	nonceResultsD := cu.Malloc(maxOutputResults * WORDSZ)
 	defer cu.MemFreeHost(nonceResultsH)
 	defer nonceResultsD.Free()
 
 	nonceResultsHSliceHeader := reflect.SliceHeader{
 		Data: uintptr(nonceResultsH),
-		Len:  int(d.cuInSize),
-		Cap:  int(d.cuInSize),
+		Len:  int(maxOutputResults),
+		Cap:  int(maxOutputResults),
 	}
 	nonceResultsHSlice := *(*[]uint32)(unsafe.Pointer(&nonceResultsHSliceHeader))
 
-	endianData := new([192]byte)
-
+	// Mining loop.
 	for {
 		d.updateCurrentWork()
 
@@ -319,16 +337,6 @@ func (d *Device) runDevice() error {
 		util.RolloverExtraNonce(&d.extraNonce)
 		d.lastBlock[work.Nonce1Word] = d.extraNonce
 
-		copy(endianData[:], d.work.Data[:128])
-		for i, j := 128, 0; i < 180; {
-			b := make([]byte, 4)
-			binary.BigEndian.PutUint32(b, d.lastBlock[j])
-			copy(endianData[i:], b)
-			i += 4
-			j++
-		}
-		decredCPUSetBlock52(endianData)
-
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
 		ts := d.work.JobTime
@@ -338,26 +346,22 @@ func (d *Device) runDevice() error {
 		}
 		d.lastBlock[work.TimestampWord] = ts
 
+		// Clear the results buffer.
 		nonceResultsHSlice[0] = 0
+		cu.MemcpyHtoD(nonceResultsD, nonceResultsH, maxOutputResults*WORDSZ)
 
-		cu.MemcpyHtoD(nonceResultsD, nonceResultsH, d.cuInSize*4)
+		// Copy data into the input buffers.
+		copy(midstateHSlice, d.midstate[:])
+		copy(lastBlockHSlice, d.lastBlock[:])
 
 		// Execute the kernel and follow its execution time.
 		currentTime := time.Now()
+		decredBlake3Hash(dimGrid, threadsPerBlock, midstateH, lastBlockH, nonceResultsD)
 
-		startNonce := d.lastBlock[work.Nonce1Word]
-
-		throughput := uint32(0x20000000)
-		//gridx := ((throughput - 1) / 640)
-
-		gridx := uint32(52428) // like ccminer
-
-		targetHigh := ^uint32(0)
-
-		decredHashNonce(gridx, blockx, throughput, startNonce, nonceResultsD, targetHigh)
-
-		cu.MemcpyDtoH(nonceResultsH, nonceResultsD, d.cuInSize)
+		// Copy results back from device to host.
+		cu.MemcpyDtoH(nonceResultsH, nonceResultsD, maxOutputResults*WORDSZ)
 
+		// Verify the results.
 		numResults := nonceResultsHSlice[0]
 		for i, result := range nonceResultsHSlice[1 : 1+numResults] {
 			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
@@ -378,14 +382,6 @@ func (d *Device) runDevice() error {
 	}
 }
 
-func minUint32(a, b uint32) uint32 {
-	if a > b {
-		return a
-	} else {
-		return b
-	}
-}
-
 func newMinerDevs(m *Miner) (*Miner, int, error) {
 	deviceListIndex := 0
 	deviceListEnabledCount := 0
diff --git a/decred.cu b/decred.cu
deleted file mode 100644
index d855f3b..0000000
--- a/decred.cu
+++ /dev/null
@@ -1,359 +0,0 @@
-/**
- * Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2/6.1)
- *
- * Tanguy Pruvot - Feb 2016
- *
- * Merged 8-round blake (XVC) tweaks
- * Further improved by: ~2.72%
- * Alexis Provos - Jun 2016
- */
-
-// nvcc  -I. -c decred.cu --ptx
-
-#include <stdint.h>
-#include <memory.h>
-#include "miner.h"
-
-#if defined(_WIN32)
-#define DLLEXPORT __declspec(dllexport)
-#else
-#define DLLEXPORT
-#endif /* _WIN32 */
-
-extern "C" {
-#include "sph/sph_blake.h"
-}
-
-/* threads per block */
-#define TPB 640
-
-/* max count of found nonces in one call (like sgminer) */
-#define maxResults 4
-
-/* hash by cpu with blake 256 */
-extern "C" void decred_hash(void *output, const void *input)
-{
-	sph_blake256_context ctx;
-
-	sph_blake256_set_rounds(14);
-
-	sph_blake256_init(&ctx);
-	sph_blake256(&ctx, input, 180);
-	sph_blake256_close(&ctx, output);
-}
-
-#include "cuda_helper.h"
-
-#ifdef __INTELLISENSE__
-#define __byte_perm(x, y, b) x
-#define atomicInc(p, max) (*p)++
-#endif
-
-__constant__ uint32_t _ALIGN(16) c_h[2];
-__constant__ uint32_t _ALIGN(16) c_data[32];
-__constant__ uint32_t _ALIGN(16) c_xors[215];
-
-#define ROR8(a)  __byte_perm(a, 0, 0x0321)
-#define ROL16(a) __byte_perm(a, 0, 0x1032)
-
-/* macro bodies */
-#define pxorGS(a,b,c,d) { \
-	v[a]+= c_xors[i++] + v[b]; \
-	v[d] = ROL16(v[d] ^ v[a]); \
-	v[c]+= v[d]; \
-	v[b] = ROTR32(v[b] ^ v[c], 12); \
-	v[a]+= c_xors[i++] + v[b]; \
-	v[d] = ROR8(v[d] ^ v[a]); \
-	v[c]+= v[d]; \
-	v[b] = ROTR32(v[b] ^ v[c], 7); \
-}
-
-#define pxorGS2(a,b,c,d, a1,b1,c1,d1) {\
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
-}
-
-#define pxory1GS2(a,b,c,d, a1,b1,c1,d1) { \
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= (c_xors[i++]^nonce) + v[b1]; \
-	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
-}
-
-#define pxory0GS2(a,b,c,d, a1,b1,c1,d1) { \
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
-	v[ a]+= (c_xors[i++]^nonce) + v[ b];    v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
-}
-
-#define pxorx1GS2(a,b,c,d, a1,b1,c1,d1) { \
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= (c_xors[i++]^nonce) + v[b1]; \
-	v[ d] = ROL16(v[ d] ^ v[ a]);           v[d1] = ROL16(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
-	v[ a]+= c_xors[i++] + v[ b];            v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROR8(v[ d] ^ v[ a]);            v[d1] = ROR8(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 7);       v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
-}
-
-#define pxorx0GS2(a,b,c,d, a1,b1,c1,d1) { \
-	v[ a]+= (c_xors[i++]^nonce) + v[ b];    v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROL16(v[ d] ^ v[ a]); 	        v[d1] = ROL16(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 12);      v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
-	v[ a]+= c_xors[i++] + v[ b]; 			v[a1]+= c_xors[i++] + v[b1]; \
-	v[ d] = ROR8(v[ d] ^ v[ a]); 	        v[d1] = ROR8(v[d1] ^ v[a1]); \
-	v[ c]+= v[ d];                          v[c1]+= v[d1]; \
-	v[ b] = ROTR32(v[ b] ^ v[ c], 7); 		v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
-}
-
-extern "C"
-{
-
-//__global__ __launch_bounds__(TPB,1)
-__global__ void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t highTarget)
-{
-	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
-
-	if (thread < threads)
-	{
-		uint32_t v[16];
-		#pragma unroll
-		for(int i=0; i<16; i+=4) {
-			*(uint4*)&v[i] = *(uint4*)&c_data[i];
-		}
-
-		const uint32_t nonce = startNonce + thread;
-		v[ 1]+= (nonce ^ 0x13198A2E);
-		v[13] = ROR8(v[13] ^ v[1]);
-		v[ 9]+= v[13];
-		v[ 5] = ROTR32(v[5] ^ v[9], 7);
-
-		int i = 0;
-		v[ 1]+= c_xors[i++];// + v[ 6];
-		v[ 0]+= v[5];
-		v[12] = ROL16(v[12] ^ v[ 1]);         v[15] = ROL16(v[15] ^ v[ 0]);
-		v[11]+= v[12];                        v[10]+= v[15];
-		v[ 6] = ROTR32(v[ 6] ^ v[11], 12);    v[ 5] = ROTR32(v[5] ^ v[10], 12);
-		v[ 1]+= c_xors[i++] + v[ 6];          v[ 0]+= c_xors[i++] + v[ 5];
-		v[12] = ROR8(v[12] ^ v[ 1]);          v[15] = ROR8(v[15] ^ v[ 0]);
-		v[11]+= v[12];                        v[10]+= v[15];
-		v[ 6] = ROTR32(v[ 6] ^ v[11], 7);     v[ 5] = ROTR32(v[ 5] ^ v[10], 7);
-
-		pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxory1GS2( 2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorx1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorx1GS2( 2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxory1GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxory1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorx1GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxory0GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorx0GS2( 2, 7, 8, 13, 3, 4, 9, 14);
-		pxory1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxory1GS2( 2, 7, 8, 13, 3, 4, 9, 14);
-		pxorGS2(   0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorx1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2(   2, 7, 8, 13, 3, 4, 9, 14);
-		pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2(   2, 6, 10, 14, 3, 7, 11, 15); pxorGS2(   0, 5, 10, 15, 1, 6, 11, 12); pxorGS(    2, 7, 8, 13);
-
-		if ((c_h[1]^v[15]) == v[7]) {
-		        uint32_t pos = atomicInc(&resNonce[0], UINT32_MAX)+1;
-			resNonce[pos] = nonce;
-			return;
-		}
-	}
-}
-}
-
-extern "C" {
-DLLEXPORT void
-decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads,
-    uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh)
-{
-	decred_gpu_hash_nonce <<<grid, block>>> (threads, startNonce, resNonce, targetHigh);
-}
-}
-
-extern "C" {
-__host__ DLLEXPORT void
-decred_cpu_setBlock_52(const uint32_t *input)
-{
-	/*
-	for (int i = 0; i < 180/4; i++)
-		printf("%08x", input[i]);
-	printf("\n");
-	*/
-/*
-	Precompute everything possible and pass it on constant memory
-*/
-	const uint32_t z[16] = {
-		0x243F6A88U, 0x85A308D3U, 0x13198A2EU, 0x03707344U,
-		0xA4093822U, 0x299F31D0U, 0x082EFA98U, 0xEC4E6C89U,
-		0x452821E6U, 0x38D01377U, 0xBE5466CFU, 0x34E90C6CU,
-		0xC0AC29B7U, 0xC97C50DDU, 0x3F84D5B5U, 0xB5470917U
-	};
-
-	int i=0;
-	uint32_t _ALIGN(64) preXOR[215];
-	uint32_t _ALIGN(64)   data[16];
-	uint32_t _ALIGN(64)      m[16];
-	uint32_t _ALIGN(64)      h[ 2];
-
-	sph_blake256_context ctx;
-	sph_blake256_set_rounds(14);
-	sph_blake256_init(&ctx);
-	sph_blake256(&ctx, input, 128);
-
-	data[ 0] = ctx.H[0];
-	data[ 1] = ctx.H[1];
-	data[ 2] = ctx.H[2];
-	data[ 3] = ctx.H[3];
-	data[ 4] = ctx.H[4];
-	data[ 5] = ctx.H[5];
-	data[ 8] = ctx.H[6];
-
-	data[12] = swab32(input[35]);
-	data[13] = ctx.H[7];
-
-	// pre swab32
-	m[ 0] = swab32(input[32]);	m[ 1] = swab32(input[33]);
-	m[ 2] = swab32(input[34]);	m[ 3] = 0;
-	m[ 4] = swab32(input[36]);	m[ 5] = swab32(input[37]);
-	m[ 6] = swab32(input[38]);	m[ 7] = swab32(input[39]);
-	m[ 8] = swab32(input[40]);	m[ 9] = swab32(input[41]);
-	m[10] = swab32(input[42]);	m[11] = swab32(input[43]);
-	m[12] = swab32(input[44]);	m[13] = 0x80000001;
-	m[14] = 0;
-	m[15] = 0x000005a0;
-
-	h[ 0] = data[ 8];
-	h[ 1] = data[13];
-
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_h,h, 8, 0, cudaMemcpyHostToDevice));
-
-	data[ 0]+= (m[ 0] ^ z[1]) + data[ 4];
-	data[12]  = SPH_ROTR32(z[4] ^ SPH_C32(0x5A0) ^ data[ 0], 16);
-
-	data[ 8] = z[0]+data[12];
-	data[ 4] = SPH_ROTR32(data[ 4] ^ data[ 8], 12);
-	data[ 0]+= (m[ 1] ^ z[0]) + data[ 4];
-	data[12] = SPH_ROTR32(data[12] ^ data[ 0],8);
-	data[ 8]+= data[12];
-	data[ 4] = SPH_ROTR32(data[ 4] ^ data[ 8], 7);
-
-	data[ 1]+= (m[ 2] ^ z[3]) + data[ 5];
-	data[13] = SPH_ROTR32((z[5] ^ SPH_C32(0x5A0)) ^ data[ 1], 16);
-	data[ 9] = z[1]+data[13];
-	data[ 5] = SPH_ROTR32(data[ 5] ^ data[ 9], 12);
-	data[ 1]+= data[ 5]; //+nonce ^ ...
-
-	data[ 2]+= (m[ 4] ^ z[5]) + h[ 0];
-	data[14] = SPH_ROTR32(z[6] ^ data[ 2],16);
-	data[10] = z[2] + data[14];
-	data[ 6] = SPH_ROTR32(h[ 0] ^ data[10], 12);
-	data[ 2]+= (m[ 5] ^ z[4]) + data[ 6];
-	data[14] = SPH_ROTR32(data[14] ^ data[ 2], 8);
-	data[10]+= data[14];
-	data[ 6] = SPH_ROTR32(data[ 6] ^ data[10], 7);
-
-	data[ 3]+= (m[ 6] ^ z[7]) + h[ 1];
-	data[15] = SPH_ROTR32(z[7] ^ data[ 3],16);
-	data[11] = z[3] + data[15];
-	data[ 7] = SPH_ROTR32(h[ 1] ^ data[11], 12);
-	data[ 3]+= (m[ 7] ^ z[6]) + data[ 7];
-	data[15] = SPH_ROTR32(data[15] ^ data[ 3],8);
-	data[11]+= data[15];
-	data[ 7] = SPH_ROTR32(data[11] ^ data[ 7], 7);
-	data[ 0]+= m[ 8] ^ z[9];
-
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, 64, 0, cudaMemcpyHostToDevice));
-
-#define precalcXORGS(x,y) { \
-	preXOR[i++]= (m[x] ^ z[y]); \
-	preXOR[i++]= (m[y] ^ z[x]); \
-}
-#define precalcXORGS2(x,y,x1,y1){\
-	preXOR[i++] = (m[ x] ^ z[ y]);\
-	preXOR[i++] = (m[x1] ^ z[y1]);\
-	preXOR[i++] = (m[ y] ^ z[ x]);\
-	preXOR[i++] = (m[y1] ^ z[x1]);\
-}
-	precalcXORGS(10,11);
-	preXOR[ 0]+=data[ 6];
-	preXOR[i++] = (m[9] ^ z[8]);
-	precalcXORGS2(12,13,14,15);
-	precalcXORGS2(14,10, 4, 8);
-	precalcXORGS2( 9,15,13, 6);
-	precalcXORGS2( 1,12, 0, 2);
-	precalcXORGS2(11, 7, 5, 3);
-	precalcXORGS2(11, 8,12, 0);
-	precalcXORGS2( 5, 2,15,13);
-	precalcXORGS2(10,14, 3, 6);
-	precalcXORGS2( 7, 1, 9, 4);
-	precalcXORGS2( 7, 9, 3, 1);
-	precalcXORGS2(13,12,11,14);
-	precalcXORGS2( 2, 6, 5,10);
-	precalcXORGS2( 4, 0,15, 8);
-	precalcXORGS2( 9, 0, 5, 7);
-	precalcXORGS2( 2, 4,10,15);
-	precalcXORGS2(14, 1,11,12);
-	precalcXORGS2( 6, 8, 3,13);
-	precalcXORGS2( 2,12, 6,10);
-	precalcXORGS2( 0,11, 8, 3);
-	precalcXORGS2( 4,13, 7, 5);
-	precalcXORGS2(15,14, 1, 9);
-	precalcXORGS2(12, 5, 1,15);
-	precalcXORGS2(14,13, 4,10);
-	precalcXORGS2( 0, 7, 6, 3);
-	precalcXORGS2( 9, 2, 8,11);
-	precalcXORGS2(13,11, 7,14);
-	precalcXORGS2(12, 1, 3, 9);
-	precalcXORGS2( 5, 0,15, 4);
-	precalcXORGS2( 8, 6, 2,10);
-	precalcXORGS2( 6,15,14, 9);
-	precalcXORGS2(11, 3, 0, 8);
-	precalcXORGS2(12, 2,13, 7);
-	precalcXORGS2( 1, 4,10, 5);
-	precalcXORGS2(10, 2, 8, 4);
-	precalcXORGS2( 7, 6, 1, 5);
-	precalcXORGS2(15,11, 9,14);
-	precalcXORGS2( 3,12,13, 0);
-	precalcXORGS2( 0, 1, 2, 3);
-	precalcXORGS2( 4, 5, 6, 7);
-	precalcXORGS2( 8, 9,10,11);
-	precalcXORGS2(12,13,14,15);
-	precalcXORGS2(14,10, 4, 8);
-	precalcXORGS2( 9,15,13, 6);
-	precalcXORGS2( 1,12, 0, 2);
-	precalcXORGS2(11, 7, 5, 3);
-	precalcXORGS2(11, 8,12, 0);
-	precalcXORGS2( 5, 2,15,13);
-	precalcXORGS2(10,14, 3, 6);
-	precalcXORGS2( 7, 1, 9, 4);
-	precalcXORGS2( 7, 9, 3, 1);
-	precalcXORGS2(13,12,11,14);
-	precalcXORGS2( 2, 6, 5,10);
-	precalcXORGS( 4, 0);
-	precalcXORGS(15, 8);
-
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_xors, preXOR, 215*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-}
-}
-
-/* ############################################################################################################################### */
-
diff --git a/decred.h b/decred.h
index fb7ca6e..c28aa62 100644
--- a/decred.h
+++ b/decred.h
@@ -7,9 +7,7 @@
 extern "C" {
 #endif /* __cplusplus */
 
-void	decred_hash_nonce(uint32_t grid, uint32_t block, uint32_t threads,
-	    uint32_t startNonce, uint32_t *resNonce, uint32_t targetHigh);
-void	decred_cpu_setBlock_52(const uint32_t *input);
+void decred_blake3_hash(const uint32_t dimgrid, const uint32_t threads, uint32_t *midstate, uint32_t *lastblock, uint32_t *out);
 
 #ifdef __cplusplus
 }
diff --git a/miner.h b/miner.h
deleted file mode 100644
index 4b3e1ae..0000000
--- a/miner.h
+++ /dev/null
@@ -1,622 +0,0 @@
-#ifndef __MINER_H__
-#define __MINER_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//#include <ccminer-config.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <inttypes.h>
-
-#ifdef _MSC_VER
-#undef HAVE_ALLOCA_H
-#undef HAVE_SYSLOG_H
-#endif
-
-#ifdef STDC_HEADERS
-# include <stdlib.h>
-# include <stddef.h>
-#else
-# ifdef HAVE_STDLIB_H
-#  include <stdlib.h>
-# endif
-#endif
-
-#ifdef HAVE_ALLOCA_H
-# include <alloca.h>
-#elif !defined alloca
-# ifdef __GNUC__
-#  define alloca __builtin_alloca
-# elif defined _AIX
-#  define alloca __alloca
-# elif defined _MSC_VER
-#  include <malloc.h>
-#  define alloca _alloca
-# elif !defined HAVE_ALLOCA
-void *alloca (size_t);
-# endif
-#endif
-
-#include "compat.h"
-
-#ifdef __INTELLISENSE__
-/* should be in stdint.h but... */
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-typedef __int32 int32_t;
-typedef unsigned __int32 uint32_t;
-typedef __int16 int16_t;
-typedef unsigned __int16 uint16_t;
-typedef __int16 int8_t;
-typedef unsigned __int16 uint8_t;
-
-typedef unsigned __int32 time_t;
-typedef char *  va_list;
-#endif
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0
-# undef _ALIGN
-# define _ALIGN(x) __align__(x)
-#endif
-
-#ifdef HAVE_SYSLOG_H
-#include <syslog.h>
-#define LOG_BLUE 0x10
-#define LOG_RAW  0x99
-#else
-enum {
-	LOG_ERR,
-	LOG_WARNING,
-	LOG_NOTICE,
-	LOG_INFO,
-	LOG_DEBUG,
-	/* custom notices */
-	LOG_BLUE = 0x10,
-	LOG_RAW  = 0x99
-};
-#endif
-
-typedef unsigned char uchar;
-
-#undef unlikely
-#undef likely
-#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
-#define unlikely(expr) (__builtin_expect(!!(expr), 0))
-#define likely(expr) (__builtin_expect(!!(expr), 1))
-#else
-#define unlikely(expr) (expr)
-#define likely(expr) (expr)
-#endif
-
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-#endif
-
-#ifndef max
-# define max(a, b)  ((a) > (b) ? (a) : (b))
-#endif
-#ifndef min
-# define min(a, b)  ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef UINT32_MAX
-/* for gcc 4.4 */
-#define UINT32_MAX UINT_MAX
-#endif
-
-static inline bool is_windows(void) {
-#ifdef WIN32
-        return 1;
-#else
-        return 0;
-#endif
-}
-
-#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-#define WANT_BUILTIN_BSWAP
-#else
-#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
-                   | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
-#define bswap_64(x) (((uint64_t) bswap_32((uint32_t)((x) & 0xffffffffu)) << 32) \
-                   | (uint64_t) bswap_32((uint32_t)((x) >> 32)))
-#endif
-
-static inline uint32_t swab32(uint32_t v)
-{
-#ifdef WANT_BUILTIN_BSWAP
-	return __builtin_bswap32(v);
-#else
-	return bswap_32(v);
-#endif
-}
-
-static inline uint64_t swab64(uint64_t v)
-{
-#ifdef WANT_BUILTIN_BSWAP
-	return __builtin_bswap64(v);
-#else
-	return bswap_64(v);
-#endif
-}
-
-static inline void swab256(void *dest_p, const void *src_p)
-{
-	uint32_t *dest = (uint32_t *) dest_p;
-	const uint32_t *src = (const uint32_t *) src_p;
-
-	dest[0] = swab32(src[7]);
-	dest[1] = swab32(src[6]);
-	dest[2] = swab32(src[5]);
-	dest[3] = swab32(src[4]);
-	dest[4] = swab32(src[3]);
-	dest[5] = swab32(src[2]);
-	dest[6] = swab32(src[1]);
-	dest[7] = swab32(src[0]);
-}
-
-#ifdef HAVE_SYS_ENDIAN_H
-#include <sys/endian.h>
-#endif
-
-#if !HAVE_DECL_BE32DEC
-static inline uint32_t be32dec(const void *pp)
-{
-	const uint8_t *p = (uint8_t const *)pp;
-	return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
-	    ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
-}
-#endif
-
-#if !HAVE_DECL_LE32DEC
-static inline uint32_t le32dec(const void *pp)
-{
-	const uint8_t *p = (uint8_t const *)pp;
-	return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
-	    ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
-}
-#endif
-
-#if !HAVE_DECL_BE32ENC
-static inline void be32enc(void *pp, uint32_t x)
-{
-	uint8_t *p = (uint8_t *)pp;
-	p[3] = x & 0xff;
-	p[2] = (x >> 8) & 0xff;
-	p[1] = (x >> 16) & 0xff;
-	p[0] = (x >> 24) & 0xff;
-}
-#endif
-
-#if !HAVE_DECL_LE32ENC
-static inline void le32enc(void *pp, uint32_t x)
-{
-	uint8_t *p = (uint8_t *)pp;
-	p[0] = x & 0xff;
-	p[1] = (x >> 8) & 0xff;
-	p[2] = (x >> 16) & 0xff;
-	p[3] = (x >> 24) & 0xff;
-}
-#endif
-
-#if !HAVE_DECL_BE16DEC
-static inline uint16_t be16dec(const void *pp)
-{
-	const uint8_t *p = (uint8_t const *)pp;
-	return ((uint16_t)(p[1]) + ((uint16_t)(p[0]) << 8));
-}
-#endif
-
-#if !HAVE_DECL_BE16ENC
-static inline void be16enc(void *pp, uint16_t x)
-{
-	uint8_t *p = (uint8_t *)pp;
-	p[1] = x & 0xff;
-	p[0] = (x >> 8) & 0xff;
-}
-#endif
-
-#if !HAVE_DECL_LE16DEC
-static inline uint16_t le16dec(const void *pp)
-{
-	const uint8_t *p = (uint8_t const *)pp;
-	return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8));
-}
-#endif
-
-#if !HAVE_DECL_LE16ENC
-static inline void le16enc(void *pp, uint16_t x)
-{
-	uint8_t *p = (uint8_t *)pp;
-	p[0] = x & 0xff;
-	p[1] = (x >> 8) & 0xff;
-}
-#endif
-
-/* used for struct work */
-void *aligned_calloc(int size);
-void aligned_free(void *ptr);
-
-#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION
-
-void sha256_init(uint32_t *state);
-void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
-void sha256d(unsigned char *hash, const unsigned char *data, int len);
-
-#define HAVE_SHA256_4WAY 0
-#define HAVE_SHA256_8WAY 0
-
-struct work;
-
-/* api related */
-void *api_thread(void *userdata);
-void api_set_throughput(int thr_id, uint32_t throughput);
-
-struct cgpu_info {
-	uint8_t gpu_id;
-	uint8_t thr_id;
-	int accepted;
-	int rejected;
-	int hw_errors;
-	double khashes;
-	uint8_t intensity_int;
-	uint8_t has_monitoring;
-	float gpu_temp;
-	uint16_t gpu_fan;
-	uint16_t gpu_fan_rpm;
-	uint16_t gpu_arch;
-	int gpu_clock;
-	int gpu_memclock;
-	size_t gpu_mem;
-	size_t gpu_memfree;
-	uint32_t gpu_power;
-	double gpu_vddc;
-	int16_t gpu_pstate;
-	int16_t gpu_bus;
-	uint16_t gpu_vid;
-	uint16_t gpu_pid;
-
-	int8_t nvml_id;
-	int8_t nvapi_id;
-
-	char gpu_sn[64];
-	char gpu_desc[64];
-	float intensity;
-	uint32_t throughput;
-};
-
-struct stats_data {
-	uint32_t uid;
-	uint32_t tm_stat;
-	uint32_t hashcount;
-	uint32_t height;
-
-	double difficulty;
-	double hashrate;
-
-	uint8_t thr_id;
-	uint8_t gpu_id;
-	uint8_t hashfound;
-	uint8_t ignored;
-
-	uint8_t npool;
-	uint8_t pool_type;
-	uint16_t align;
-};
-
-struct hashlog_data {
-	uint8_t npool;
-	uint8_t pool_type;
-	uint16_t align;
-
-	uint32_t height;
-	uint32_t njobid;
-	uint32_t nonce;
-	uint32_t scanned_from;
-	uint32_t scanned_to;
-	uint32_t last_from;
-	uint32_t tm_add;
-	uint32_t tm_upd;
-	uint32_t tm_sent;
-};
-
-/* end of api */
-
-
-struct work_restart {
-	/* volatile to modify accross threads (vstudio thing) */
-	volatile uint32_t restart;
-	char padding[128 - sizeof(uint32_t)];
-};
-
-#ifdef HAVE_GETOPT_LONG
-#include <getopt.h>
-#else
-struct option {
-	const char *name;
-	int has_arg;
-	int *flag;
-	int val;
-};
-#endif
-extern int options_count();
-
-extern bool opt_benchmark;
-extern bool opt_debug;
-extern bool opt_quiet;
-extern bool opt_protocol;
-extern bool opt_showdiff;
-extern bool opt_tracegpu;
-extern int opt_n_threads;
-extern int active_gpus;
-extern int gpu_threads;
-extern int opt_timeout;
-extern bool want_longpoll;
-extern bool have_longpoll;
-extern bool want_stratum;
-extern bool have_stratum;
-extern bool opt_stratum_stats;
-extern char *opt_cert;
-extern char *opt_proxy;
-extern long opt_proxy_type;
-extern bool use_syslog;
-extern bool use_colors;
-extern int use_pok;
-extern struct thr_info *thr_info;
-extern int longpoll_thr_id;
-extern int stratum_thr_id;
-extern int api_thr_id;
-extern volatile bool abort_flag;
-extern struct work_restart *work_restart;
-extern bool opt_trust_pool;
-extern uint16_t opt_vote;
-
-extern uint64_t global_hashrate;
-extern uint64_t net_hashrate;
-extern double net_diff;
-extern double stratum_diff;
-
-#define MAX_GPUS 16
-//#define MAX_THREADS 32 todo
-extern char* device_name[MAX_GPUS];
-extern short device_map[MAX_GPUS];
-extern long  device_sm[MAX_GPUS];
-extern uint32_t gpus_intensity[MAX_GPUS];
-extern int opt_cudaschedule;
-
-// cuda.cpp
-int cuda_num_devices();
-void cuda_devicenames();
-void cuda_reset_device(int thr_id, bool *init);
-void cuda_shutdown();
-int cuda_finddevice(char *name);
-int cuda_version();
-void cuda_print_devices();
-int cuda_gpu_info(struct cgpu_info *gpu);
-int cuda_available_memory(int thr_id);
-
-uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
-#define device_intensity(t,f,d) cuda_default_throughput(t,d)
-
-void cuda_log_lasterror(int thr_id, const char* func, int line);
-void cuda_clear_lasterror();
-#define CUDA_LOG_ERROR() cuda_log_lasterror(thr_id, __func__, __LINE__)
-
-extern void format_hashrate(double hashrate, char *output);
-extern void applog(int prio, const char *fmt, ...);
-extern void gpulog(int prio, int thr_id, const char *fmt, ...);
-void get_defconfig_path(char *out, size_t bufsize, char *argv0);
-extern void cbin2hex(char *out, const char *in, size_t len);
-extern char *bin2hex(const unsigned char *in, size_t len);
-extern bool hex2bin(void *output, const char *hexstr, size_t len);
-extern int timeval_subtract(struct timeval *result, struct timeval *x,
-	struct timeval *y);
-extern bool fulltest(const uint32_t *hash, const uint32_t *target);
-void diff_to_target(uint32_t* target, double diff);
-void work_set_target(struct work* work, double diff);
-double target_to_diff(uint32_t* target);
-extern void get_currentalgo(char* buf, int sz);
-
-// bignum
-double bn_convert_nbits(const uint32_t nbits);
-void bn_nbits_to_uchar(const uint32_t nBits, uchar *target);
-double bn_hash_target_ratio(uint32_t* hash, uint32_t* target);
-void bn_store_hash_target_ratio(uint32_t* hash, uint32_t* target, struct work* work);
-void work_set_target_ratio(struct work* work, uint32_t* hash);
-
-// bench
-extern int bench_algo;
-void bench_init(int threads);
-void bench_free();
-bool bench_algo_switch_next(int thr_id);
-void bench_set_throughput(int thr_id, uint32_t throughput);
-void bench_display_results();
-
-struct stratum_job {
-	char *job_id;
-	unsigned char prevhash[32];
-	size_t coinbase_size;
-	unsigned char *coinbase;
-	unsigned char *xnonce2;
-	int merkle_count;
-	unsigned char **merkle;
-	unsigned char version[4];
-	unsigned char nbits[4];
-	unsigned char ntime[4];
-	unsigned char claim[32]; // lbry
-	bool clean;
-	unsigned char nreward[2];
-	uint32_t height;
-	double diff;
-};
-
-#define POK_MAX_TXS   4
-#define POK_MAX_TX_SZ 16384U
-struct tx {
-	uint8_t data[POK_MAX_TX_SZ];
-	uint32_t len;
-};
-
-struct work {
-	uint32_t data[48];
-	uint32_t target[8];
-	uint32_t maxvote;
-
-	char job_id[128];
-	size_t xnonce2_len;
-	uchar xnonce2[32];
-
-	union {
-		uint32_t u32[2];
-		uint64_t u64[1];
-	} noncerange;
-
-	uint32_t nonces[2];
-
-	double targetdiff;
-	double shareratio;
-	double sharediff;
-	uint32_t height;
-	uint8_t  pooln;
-
-	uint32_t scanned_from;
-	uint32_t scanned_to;
-
-	/* pok getwork txs */
-	uint32_t tx_count;
-	struct tx txs[POK_MAX_TXS];
-};
-
-#define POK_BOOL_MASK 0x00008000
-#define POK_DATA_MASK 0xFFFF0000
-
-
-#define MAX_POOLS 8
-struct pool_infos {
-	uint8_t id;
-#define POOL_UNUSED   0
-#define POOL_GETWORK  1
-#define POOL_STRATUM  2
-#define POOL_LONGPOLL 4
-	uint8_t type;
-#define POOL_ST_DEFINED 1
-#define POOL_ST_VALID 2
-#define POOL_ST_DISABLED 4
-#define POOL_ST_REMOVED 8
-	uint16_t status;
-	int algo;
-	char name[64];
-	// credentials
-	char url[512];
-	char short_url[64];
-	char user[64];
-	char pass[384];
-	// config options
-	double max_diff;
-	double max_rate;
-	int shares_limit;
-	int time_limit;
-	int scantime;
-	// connection
-	uint8_t allow_gbt;
-	uint8_t allow_mininginfo;
-	uint16_t check_dups; // 16_t for align
-	int retries;
-	int fail_pause;
-	int timeout;
-	// stats
-	uint32_t work_time;
-	uint32_t wait_time;
-	uint32_t accepted_count;
-	uint32_t rejected_count;
-	uint32_t solved_count;
-	uint32_t stales_count;
-	time_t last_share_time;
-	double best_share;
-	uint32_t disconnects;
-};
-
-extern struct pool_infos pools[MAX_POOLS];
-extern int num_pools;
-extern volatile int cur_pooln;
-
-void pool_init_defaults(void);
-void pool_set_creds(int pooln);
-void pool_set_attr(int pooln, const char* key, char* arg);
-bool pool_switch_url(char *params);
-bool pool_switch(int thr_id, int pooln);
-bool pool_switch_next(int thr_id);
-int pool_get_first_valid(int startfrom);
-void pool_dump_infos(void);
-
-bool stratum_socket_full(struct stratum_ctx *sctx, int timeout);
-bool stratum_send_line(struct stratum_ctx *sctx, char *s);
-char *stratum_recv_line(struct stratum_ctx *sctx);
-bool stratum_connect(struct stratum_ctx *sctx, const char *url);
-void stratum_disconnect(struct stratum_ctx *sctx);
-bool stratum_subscribe(struct stratum_ctx *sctx);
-bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
-bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
-void stratum_free_job(struct stratum_ctx *sctx);
-
-void hashlog_remember_submit(struct work* work, uint32_t nonce);
-void hashlog_remember_scan_range(struct work* work);
-uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
-uint32_t hashlog_get_last_sent(char* jobid);
-uint64_t hashlog_get_scan_range(char* jobid);
-int  hashlog_get_history(struct hashlog_data *data, int max_records);
-void hashlog_purge_old(void);
-void hashlog_purge_job(char* jobid);
-void hashlog_purge_all(void);
-void hashlog_dump_job(char* jobid);
-void hashlog_getmeminfo(uint64_t *mem, uint32_t *records);
-
-void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found, uint32_t height);
-double stats_get_speed(int thr_id, double def_speed);
-double stats_get_gpu_speed(int gpu_id);
-int  stats_get_history(int thr_id, struct stats_data *data, int max_records);
-void stats_purge_old(void);
-void stats_purge_all(void);
-void stats_getmeminfo(uint64_t *mem, uint32_t *records);
-
-struct thread_q;
-
-extern struct thread_q *tq_new(void);
-extern void tq_free(struct thread_q *tq);
-extern bool tq_push(struct thread_q *tq, void *data);
-extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime);
-extern void tq_freeze(struct thread_q *tq);
-extern void tq_thaw(struct thread_q *tq);
-
-#define EXIT_CODE_OK            0
-#define EXIT_CODE_USAGE         1
-#define EXIT_CODE_POOL_TIMEOUT  2
-#define EXIT_CODE_SW_INIT_ERROR 3
-#define EXIT_CODE_CUDA_NODEVICE 4
-#define EXIT_CODE_CUDA_ERROR    5
-#define EXIT_CODE_TIME_LIMIT    0
-#define EXIT_CODE_KILLED        7
-
-void parse_arg(int key, char *arg);
-void proper_exit(int reason);
-void restart_threads(void);
-
-size_t time2str(char* buf, time_t timer);
-char* atime2str(time_t timer);
-
-void applog_hex(void *data, int len);
-void applog_hash(void *hash);
-void applog_hash64(void *hash);
-void applog_compare_hash(void *hash, void *hash_ref);
-
-void print_hash_tests(void);
-void decred_hash(void *state, const void *input);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __MINER_H__ */
diff --git a/sph/blake.c b/sph/blake.c
deleted file mode 100644
index f2d6613..0000000
--- a/sph/blake.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-//+build ignore
-
-/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
-/*
- * BLAKE implementation.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#include <stddef.h>
-#include <string.h>
-#include <limits.h>
-
-#include "sph_blake.h"
-
-int blake256_rounds = 14;
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-
-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_SMALL_FOOTPRINT_BLAKE   1
-#endif
-
-#if SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_COMPACT_BLAKE_32   1
-#endif
-
-#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
-#define SPH_COMPACT_BLAKE_64   1
-#endif
-
-#ifdef _MSC_VER
-#pragma warning (disable: 4146)
-#endif
-
-static const sph_u32 IV224[8] = {
-	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507),
-	SPH_C32(0x3070DD17), SPH_C32(0xF70E5939),
-	SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
-	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
-};
-
-static const sph_u32 IV256[8] = {
-	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
-	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
-	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
-	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
-};
-
-#if SPH_64
-
-static const sph_u64 IV384[8] = {
-	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
-	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
-	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
-	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
-};
-
-static const sph_u64 IV512[8] = {
-	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
-};
-
-#endif
-
-#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
-
-static const unsigned sigma[16][16] = {
-	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
-	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
-	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
-	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
-	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
-	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
-	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
-	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
-	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
-	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
-	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
-	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
-	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
-	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
-};
-
-/*
-  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
- 14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
- 11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
-  7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
-  9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
-  2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
- 12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
- 13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
-  6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
- 10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
-*/
-#endif
-
-#define Z00   0
-#define Z01   1
-#define Z02   2
-#define Z03   3
-#define Z04   4
-#define Z05   5
-#define Z06   6
-#define Z07   7
-#define Z08   8
-#define Z09   9
-#define Z0A   A
-#define Z0B   B
-#define Z0C   C
-#define Z0D   D
-#define Z0E   E
-#define Z0F   F
-
-#define Z10   E
-#define Z11   A
-#define Z12   4
-#define Z13   8
-#define Z14   9
-#define Z15   F
-#define Z16   D
-#define Z17   6
-#define Z18   1
-#define Z19   C
-#define Z1A   0
-#define Z1B   2
-#define Z1C   B
-#define Z1D   7
-#define Z1E   5
-#define Z1F   3
-
-#define Z20   B
-#define Z21   8
-#define Z22   C
-#define Z23   0
-#define Z24   5
-#define Z25   2
-#define Z26   F
-#define Z27   D
-#define Z28   A
-#define Z29   E
-#define Z2A   3
-#define Z2B   6
-#define Z2C   7
-#define Z2D   1
-#define Z2E   9
-#define Z2F   4
-
-#define Z30   7
-#define Z31   9
-#define Z32   3
-#define Z33   1
-#define Z34   D
-#define Z35   C
-#define Z36   B
-#define Z37   E
-#define Z38   2
-#define Z39   6
-#define Z3A   5
-#define Z3B   A
-#define Z3C   4
-#define Z3D   0
-#define Z3E   F
-#define Z3F   8
-
-#define Z40   9
-#define Z41   0
-#define Z42   5
-#define Z43   7
-#define Z44   2
-#define Z45   4
-#define Z46   A
-#define Z47   F
-#define Z48   E
-#define Z49   1
-#define Z4A   B
-#define Z4B   C
-#define Z4C   6
-#define Z4D   8
-#define Z4E   3
-#define Z4F   D
-
-#define Z50   2
-#define Z51   C
-#define Z52   6
-#define Z53   A
-#define Z54   0
-#define Z55   B
-#define Z56   8
-#define Z57   3
-#define Z58   4
-#define Z59   D
-#define Z5A   7
-#define Z5B   5
-#define Z5C   F
-#define Z5D   E
-#define Z5E   1
-#define Z5F   9
-
-#define Z60   C
-#define Z61   5
-#define Z62   1
-#define Z63   F
-#define Z64   E
-#define Z65   D
-#define Z66   4
-#define Z67   A
-#define Z68   0
-#define Z69   7
-#define Z6A   6
-#define Z6B   3
-#define Z6C   9
-#define Z6D   2
-#define Z6E   8
-#define Z6F   B
-
-#define Z70   D
-#define Z71   B
-#define Z72   7
-#define Z73   E
-#define Z74   C
-#define Z75   1
-#define Z76   3
-#define Z77   9
-#define Z78   5
-#define Z79   0
-#define Z7A   F
-#define Z7B   4
-#define Z7C   8
-#define Z7D   6
-#define Z7E   2
-#define Z7F   A
-
-#define Z80   6
-#define Z81   F
-#define Z82   E
-#define Z83   9
-#define Z84   B
-#define Z85   3
-#define Z86   0
-#define Z87   8
-#define Z88   C
-#define Z89   2
-#define Z8A   D
-#define Z8B   7
-#define Z8C   1
-#define Z8D   4
-#define Z8E   A
-#define Z8F   5
-
-#define Z90   A
-#define Z91   2
-#define Z92   8
-#define Z93   4
-#define Z94   7
-#define Z95   6
-#define Z96   1
-#define Z97   5
-#define Z98   F
-#define Z99   B
-#define Z9A   9
-#define Z9B   E
-#define Z9C   3
-#define Z9D   C
-#define Z9E   D
-#define Z9F   0
-
-#define Mx(r, i)    Mx_(Z ## r ## i)
-#define Mx_(n)      Mx__(n)
-#define Mx__(n)     M ## n
-
-#define CSx(r, i)   CSx_(Z ## r ## i)
-#define CSx_(n)     CSx__(n)
-#define CSx__(n)    CS ## n
-
-#define CS0   SPH_C32(0x243F6A88)
-#define CS1   SPH_C32(0x85A308D3)
-#define CS2   SPH_C32(0x13198A2E)
-#define CS3   SPH_C32(0x03707344)
-#define CS4   SPH_C32(0xA4093822)
-#define CS5   SPH_C32(0x299F31D0)
-#define CS6   SPH_C32(0x082EFA98)
-#define CS7   SPH_C32(0xEC4E6C89)
-#define CS8   SPH_C32(0x452821E6)
-#define CS9   SPH_C32(0x38D01377)
-#define CSA   SPH_C32(0xBE5466CF)
-#define CSB   SPH_C32(0x34E90C6C)
-#define CSC   SPH_C32(0xC0AC29B7)
-#define CSD   SPH_C32(0xC97C50DD)
-#define CSE   SPH_C32(0x3F84D5B5)
-#define CSF   SPH_C32(0xB5470917)
-
-#if SPH_COMPACT_BLAKE_32
-
-static const sph_u32 CS[16] = {
-	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
-	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
-	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
-	SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
-	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
-	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
-	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
-	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
-};
-
-#endif
-
-#if SPH_64
-
-#define CBx(r, i)   CBx_(Z ## r ## i)
-#define CBx_(n)     CBx__(n)
-#define CBx__(n)    CB ## n
-
-#define CB0   SPH_C64(0x243F6A8885A308D3)
-#define CB1   SPH_C64(0x13198A2E03707344)
-#define CB2   SPH_C64(0xA4093822299F31D0)
-#define CB3   SPH_C64(0x082EFA98EC4E6C89)
-#define CB4   SPH_C64(0x452821E638D01377)
-#define CB5   SPH_C64(0xBE5466CF34E90C6C)
-#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
-#define CB7   SPH_C64(0x3F84D5B5B5470917)
-#define CB8   SPH_C64(0x9216D5D98979FB1B)
-#define CB9   SPH_C64(0xD1310BA698DFB5AC)
-#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
-#define CBB   SPH_C64(0xB8E1AFED6A267E96)
-#define CBC   SPH_C64(0xBA7C9045F12C7F99)
-#define CBD   SPH_C64(0x24A19947B3916CF7)
-#define CBE   SPH_C64(0x0801F2E2858EFC16)
-#define CBF   SPH_C64(0x636920D871574E69)
-
-#if SPH_COMPACT_BLAKE_64
-
-static const sph_u64 CB[16] = {
-	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
-	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
-	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
-	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
-	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
-	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
-	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
-	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
-};
-
-#endif
-
-#endif
-
-#define GS(m0, m1, c0, c1, a, b, c, d)   do { \
-		a = SPH_T32(a + b + (m0 ^ c1)); \
-		d = SPH_ROTR32(d ^ a, 16); \
-		c = SPH_T32(c + d); \
-		b = SPH_ROTR32(b ^ c, 12); \
-		a = SPH_T32(a + b + (m1 ^ c0)); \
-		d = SPH_ROTR32(d ^ a, 8); \
-		c = SPH_T32(c + d); \
-		b = SPH_ROTR32(b ^ c, 7); \
-	} while (0)
-
-#if SPH_COMPACT_BLAKE_32
-
-#define ROUND_S(r)   do { \
-		GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \
-			CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
-		GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \
-			CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
-		GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \
-			CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
-		GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \
-			CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
-		GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \
-			CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
-		GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \
-			CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
-		GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \
-			CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
-		GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \
-			CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
-	} while (0)
-
-#else
-
-#define ROUND_S(r)   do { \
-		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
-		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
-		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
-		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
-		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
-		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
-		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
-		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-	} while (0)
-
-#endif
-
-#if SPH_64
-
-#define GB(m0, m1, c0, c1, a, b, c, d)   do { \
-		a = SPH_T64(a + b + (m0 ^ c1)); \
-		d = SPH_ROTR64(d ^ a, 32); \
-		c = SPH_T64(c + d); \
-		b = SPH_ROTR64(b ^ c, 25); \
-		a = SPH_T64(a + b + (m1 ^ c0)); \
-		d = SPH_ROTR64(d ^ a, 16); \
-		c = SPH_T64(c + d); \
-		b = SPH_ROTR64(b ^ c, 11); \
-	} while (0)
-
-#if SPH_COMPACT_BLAKE_64
-
-#define ROUND_B(r)   do { \
-		GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \
-			CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
-		GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \
-			CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
-		GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \
-			CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
-		GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \
-			CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
-		GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \
-			CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
-		GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \
-			CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
-		GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \
-			CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
-		GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \
-			CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
-	} while (0)
-
-#else
-
-#define ROUND_B(r)   do { \
-		GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
-		GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
-		GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
-		GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
-		GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
-		GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
-		GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
-		GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
-	} while (0)
-
-#endif
-
-#endif
-
-#define DECL_STATE32 \
-	sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \
-	sph_u32 S0, S1, S2, S3, T0, T1;
-
-#define READ_STATE32(state)   do { \
-		H0 = (state)->H[0]; \
-		H1 = (state)->H[1]; \
-		H2 = (state)->H[2]; \
-		H3 = (state)->H[3]; \
-		H4 = (state)->H[4]; \
-		H5 = (state)->H[5]; \
-		H6 = (state)->H[6]; \
-		H7 = (state)->H[7]; \
-		S0 = (state)->S[0]; \
-		S1 = (state)->S[1]; \
-		S2 = (state)->S[2]; \
-		S3 = (state)->S[3]; \
-		T0 = (state)->T0; \
-		T1 = (state)->T1; \
-	} while (0)
-
-#define WRITE_STATE32(state)   do { \
-		(state)->H[0] = H0; \
-		(state)->H[1] = H1; \
-		(state)->H[2] = H2; \
-		(state)->H[3] = H3; \
-		(state)->H[4] = H4; \
-		(state)->H[5] = H5; \
-		(state)->H[6] = H6; \
-		(state)->H[7] = H7; \
-		(state)->S[0] = S0; \
-		(state)->S[1] = S1; \
-		(state)->S[2] = S2; \
-		(state)->S[3] = S3; \
-		(state)->T0 = T0; \
-		(state)->T1 = T1; \
-	} while (0)
-
-#if SPH_COMPACT_BLAKE_32
-
-#define COMPRESS32   do { \
-		sph_u32 M[16]; \
-		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
-		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
-		unsigned r; \
-		V0 = H0; \
-		V1 = H1; \
-		V2 = H2; \
-		V3 = H3; \
-		V4 = H4; \
-		V5 = H5; \
-		V6 = H6; \
-		V7 = H7; \
-		V8 = S0 ^ CS0; \
-		V9 = S1 ^ CS1; \
-		VA = S2 ^ CS2; \
-		VB = S3 ^ CS3; \
-		VC = T0 ^ CS4; \
-		VD = T0 ^ CS5; \
-		VE = T1 ^ CS6; \
-		VF = T1 ^ CS7; \
-		M[0x0] = sph_dec32be_aligned(buf +  0); \
-		M[0x1] = sph_dec32be_aligned(buf +  4); \
-		M[0x2] = sph_dec32be_aligned(buf +  8); \
-		M[0x3] = sph_dec32be_aligned(buf + 12); \
-		M[0x4] = sph_dec32be_aligned(buf + 16); \
-		M[0x5] = sph_dec32be_aligned(buf + 20); \
-		M[0x6] = sph_dec32be_aligned(buf + 24); \
-		M[0x7] = sph_dec32be_aligned(buf + 28); \
-		M[0x8] = sph_dec32be_aligned(buf + 32); \
-		M[0x9] = sph_dec32be_aligned(buf + 36); \
-		M[0xA] = sph_dec32be_aligned(buf + 40); \
-		M[0xB] = sph_dec32be_aligned(buf + 44); \
-		M[0xC] = sph_dec32be_aligned(buf + 48); \
-		M[0xD] = sph_dec32be_aligned(buf + 52); \
-		M[0xE] = sph_dec32be_aligned(buf + 56); \
-		M[0xF] = sph_dec32be_aligned(buf + 60); \
-		for (r = 0; r < blake256_rounds; r ++) \
-			ROUND_S(r); \
-		H0 ^= S0 ^ V0 ^ V8; \
-		H1 ^= S1 ^ V1 ^ V9; \
-		H2 ^= S2 ^ V2 ^ VA; \
-		H3 ^= S3 ^ V3 ^ VB; \
-		H4 ^= S0 ^ V4 ^ VC; \
-		H5 ^= S1 ^ V5 ^ VD; \
-		H6 ^= S2 ^ V6 ^ VE; \
-		H7 ^= S3 ^ V7 ^ VF; \
-	} while (0)
-
-#else
-
-#define COMPRESS32   do { \
-		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
-		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
-		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
-		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
-		V0 = H0; \
-		V1 = H1; \
-		V2 = H2; \
-		V3 = H3; \
-		V4 = H4; \
-		V5 = H5; \
-		V6 = H6; \
-		V7 = H7; \
-		V8 = S0 ^ CS0; \
-		V9 = S1 ^ CS1; \
-		VA = S2 ^ CS2; \
-		VB = S3 ^ CS3; \
-		VC = T0 ^ CS4; \
-		VD = T0 ^ CS5; \
-		VE = T1 ^ CS6; \
-		VF = T1 ^ CS7; \
-		M0 = sph_dec32be_aligned(buf +  0); \
-		M1 = sph_dec32be_aligned(buf +  4); \
-		M2 = sph_dec32be_aligned(buf +  8); \
-		M3 = sph_dec32be_aligned(buf + 12); \
-		M4 = sph_dec32be_aligned(buf + 16); \
-		M5 = sph_dec32be_aligned(buf + 20); \
-		M6 = sph_dec32be_aligned(buf + 24); \
-		M7 = sph_dec32be_aligned(buf + 28); \
-		M8 = sph_dec32be_aligned(buf + 32); \
-		M9 = sph_dec32be_aligned(buf + 36); \
-		MA = sph_dec32be_aligned(buf + 40); \
-		MB = sph_dec32be_aligned(buf + 44); \
-		MC = sph_dec32be_aligned(buf + 48); \
-		MD = sph_dec32be_aligned(buf + 52); \
-		ME = sph_dec32be_aligned(buf + 56); \
-		MF = sph_dec32be_aligned(buf + 60); \
-		ROUND_S(0); \
-		ROUND_S(1); \
-		ROUND_S(2); \
-		ROUND_S(3); \
-		ROUND_S(4); \
-		ROUND_S(5); \
-		ROUND_S(6); \
-		ROUND_S(7); \
-		if (blake256_rounds == 14) { \
-		ROUND_S(8); \
-		ROUND_S(9); \
-		ROUND_S(0); \
-		ROUND_S(1); \
-		ROUND_S(2); \
-		ROUND_S(3); \
-		} \
-		H0 ^= S0 ^ V0 ^ V8; \
-		H1 ^= S1 ^ V1 ^ V9; \
-		H2 ^= S2 ^ V2 ^ VA; \
-		H3 ^= S3 ^ V3 ^ VB; \
-		H4 ^= S0 ^ V4 ^ VC; \
-		H5 ^= S1 ^ V5 ^ VD; \
-		H6 ^= S2 ^ V6 ^ VE; \
-		H7 ^= S3 ^ V7 ^ VF; \
-	} while (0)
-
-#endif
-
-#if SPH_64
-
-#define DECL_STATE64 \
-	sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \
-	sph_u64 S0, S1, S2, S3, T0, T1;
-
-#define READ_STATE64(state)   do { \
-		H0 = (state)->H[0]; \
-		H1 = (state)->H[1]; \
-		H2 = (state)->H[2]; \
-		H3 = (state)->H[3]; \
-		H4 = (state)->H[4]; \
-		H5 = (state)->H[5]; \
-		H6 = (state)->H[6]; \
-		H7 = (state)->H[7]; \
-		S0 = (state)->S[0]; \
-		S1 = (state)->S[1]; \
-		S2 = (state)->S[2]; \
-		S3 = (state)->S[3]; \
-		T0 = (state)->T0; \
-		T1 = (state)->T1; \
-	} while (0)
-
-#define WRITE_STATE64(state)   do { \
-		(state)->H[0] = H0; \
-		(state)->H[1] = H1; \
-		(state)->H[2] = H2; \
-		(state)->H[3] = H3; \
-		(state)->H[4] = H4; \
-		(state)->H[5] = H5; \
-		(state)->H[6] = H6; \
-		(state)->H[7] = H7; \
-		(state)->S[0] = S0; \
-		(state)->S[1] = S1; \
-		(state)->S[2] = S2; \
-		(state)->S[3] = S3; \
-		(state)->T0 = T0; \
-		(state)->T1 = T1; \
-	} while (0)
-
-#if SPH_COMPACT_BLAKE_64
-
-#define COMPRESS64   do { \
-		sph_u64 M[16]; \
-		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
-		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
-		unsigned r; \
-		V0 = H0; \
-		V1 = H1; \
-		V2 = H2; \
-		V3 = H3; \
-		V4 = H4; \
-		V5 = H5; \
-		V6 = H6; \
-		V7 = H7; \
-		V8 = S0 ^ CB0; \
-		V9 = S1 ^ CB1; \
-		VA = S2 ^ CB2; \
-		VB = S3 ^ CB3; \
-		VC = T0 ^ CB4; \
-		VD = T0 ^ CB5; \
-		VE = T1 ^ CB6; \
-		VF = T1 ^ CB7; \
-		M[0x0] = sph_dec64be_aligned(buf +   0); \
-		M[0x1] = sph_dec64be_aligned(buf +   8); \
-		M[0x2] = sph_dec64be_aligned(buf +  16); \
-		M[0x3] = sph_dec64be_aligned(buf +  24); \
-		M[0x4] = sph_dec64be_aligned(buf +  32); \
-		M[0x5] = sph_dec64be_aligned(buf +  40); \
-		M[0x6] = sph_dec64be_aligned(buf +  48); \
-		M[0x7] = sph_dec64be_aligned(buf +  56); \
-		M[0x8] = sph_dec64be_aligned(buf +  64); \
-		M[0x9] = sph_dec64be_aligned(buf +  72); \
-		M[0xA] = sph_dec64be_aligned(buf +  80); \
-		M[0xB] = sph_dec64be_aligned(buf +  88); \
-		M[0xC] = sph_dec64be_aligned(buf +  96); \
-		M[0xD] = sph_dec64be_aligned(buf + 104); \
-		M[0xE] = sph_dec64be_aligned(buf + 112); \
-		M[0xF] = sph_dec64be_aligned(buf + 120); \
-		for (r = 0; r < 16; r ++) \
-			ROUND_B(r); \
-		H0 ^= S0 ^ V0 ^ V8; \
-		H1 ^= S1 ^ V1 ^ V9; \
-		H2 ^= S2 ^ V2 ^ VA; \
-		H3 ^= S3 ^ V3 ^ VB; \
-		H4 ^= S0 ^ V4 ^ VC; \
-		H5 ^= S1 ^ V5 ^ VD; \
-		H6 ^= S2 ^ V6 ^ VE; \
-		H7 ^= S3 ^ V7 ^ VF; \
-	} while (0)
-
-#else
-
-#define COMPRESS64   do { \
-		sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
-		sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
-		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
-		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
-		V0 = H0; \
-		V1 = H1; \
-		V2 = H2; \
-		V3 = H3; \
-		V4 = H4; \
-		V5 = H5; \
-		V6 = H6; \
-		V7 = H7; \
-		V8 = S0 ^ CB0; \
-		V9 = S1 ^ CB1; \
-		VA = S2 ^ CB2; \
-		VB = S3 ^ CB3; \
-		VC = T0 ^ CB4; \
-		VD = T0 ^ CB5; \
-		VE = T1 ^ CB6; \
-		VF = T1 ^ CB7; \
-		M0 = sph_dec64be_aligned(buf +   0); \
-		M1 = sph_dec64be_aligned(buf +   8); \
-		M2 = sph_dec64be_aligned(buf +  16); \
-		M3 = sph_dec64be_aligned(buf +  24); \
-		M4 = sph_dec64be_aligned(buf +  32); \
-		M5 = sph_dec64be_aligned(buf +  40); \
-		M6 = sph_dec64be_aligned(buf +  48); \
-		M7 = sph_dec64be_aligned(buf +  56); \
-		M8 = sph_dec64be_aligned(buf +  64); \
-		M9 = sph_dec64be_aligned(buf +  72); \
-		MA = sph_dec64be_aligned(buf +  80); \
-		MB = sph_dec64be_aligned(buf +  88); \
-		MC = sph_dec64be_aligned(buf +  96); \
-		MD = sph_dec64be_aligned(buf + 104); \
-		ME = sph_dec64be_aligned(buf + 112); \
-		MF = sph_dec64be_aligned(buf + 120); \
-		ROUND_B(0); \
-		ROUND_B(1); \
-		ROUND_B(2); \
-		ROUND_B(3); \
-		ROUND_B(4); \
-		ROUND_B(5); \
-		ROUND_B(6); \
-		ROUND_B(7); \
-		ROUND_B(8); \
-		ROUND_B(9); \
-		ROUND_B(0); \
-		ROUND_B(1); \
-		ROUND_B(2); \
-		ROUND_B(3); \
-		ROUND_B(4); \
-		ROUND_B(5); \
-		H0 ^= S0 ^ V0 ^ V8; \
-		H1 ^= S1 ^ V1 ^ V9; \
-		H2 ^= S2 ^ V2 ^ VA; \
-		H3 ^= S3 ^ V3 ^ VB; \
-		H4 ^= S0 ^ V4 ^ VC; \
-		H5 ^= S1 ^ V5 ^ VD; \
-		H6 ^= S2 ^ V6 ^ VE; \
-		H7 ^= S3 ^ V7 ^ VF; \
-	} while (0)
-
-#endif
-
-#endif
-
-static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
-
-static void
-blake32_init(sph_blake_small_context *sc,
-	const sph_u32 *iv, const sph_u32 *salt)
-{
-	memcpy(sc->H, iv, 8 * sizeof(sph_u32));
-	memcpy(sc->S, salt, 4 * sizeof(sph_u32));
-	sc->T0 = sc->T1 = 0;
-	sc->ptr = 0;
-}
-
-static void
-blake32(sph_blake_small_context *sc, const void *data, size_t len)
-{
-	unsigned char *buf;
-	size_t ptr;
-	DECL_STATE32
-
-	buf = sc->buf;
-	ptr = sc->ptr;
-	if (len < (sizeof sc->buf) - ptr) {
-		memcpy(buf + ptr, data, len);
-		ptr += len;
-		sc->ptr = ptr;
-		return;
-	}
-
-	READ_STATE32(sc);
-	while (len > 0) {
-		size_t clen;
-
-		clen = (sizeof sc->buf) - ptr;
-		if (clen > len)
-			clen = len;
-		memcpy(buf + ptr, data, clen);
-		ptr += clen;
-		data = (const unsigned char *)data + clen;
-		len -= clen;
-		if (ptr == sizeof sc->buf) {
-			if ((T0 = SPH_T32(T0 + 512)) < 512)
-				T1 = SPH_T32(T1 + 1);
-			COMPRESS32;
-			ptr = 0;
-		}
-	}
-	WRITE_STATE32(sc);
-	sc->ptr = ptr;
-}
-
-static void
-blake32_close(sph_blake_small_context *sc,
-	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
-{
-	union {
-		unsigned char buf[64];
-		sph_u32 dummy;
-	} u;
-	size_t ptr, k;
-	unsigned bit_len;
-	unsigned z;
-	sph_u32 th, tl;
-	unsigned char *out;
-
-	ptr = sc->ptr;
-	bit_len = ((unsigned)ptr << 3) + n;
-	z = 0x80 >> n;
-	u.buf[ptr] = ((ub & -z) | z) & 0xFF;
-	tl = sc->T0 + bit_len;
-	th = sc->T1;
-	if (ptr == 0 && n == 0) {
-		sc->T0 = SPH_C32(0xFFFFFE00);
-		sc->T1 = SPH_C32(0xFFFFFFFF);
-	} else if (sc->T0 == 0) {
-		sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
-		sc->T1 = SPH_T32(sc->T1 - 1);
-	} else {
-		sc->T0 -= 512 - bit_len;
-	}
-	if (bit_len <= 446) {
-		memset(u.buf + ptr + 1, 0, 55 - ptr);
-		if (out_size_w32 == 8)
-			u.buf[55] |= 1;
-		sph_enc32be_aligned(u.buf + 56, th);
-		sph_enc32be_aligned(u.buf + 60, tl);
-		blake32(sc, u.buf + ptr, 64 - ptr);
-	} else {
-		memset(u.buf + ptr + 1, 0, 63 - ptr);
-		blake32(sc, u.buf + ptr, 64 - ptr);
-		sc->T0 = SPH_C32(0xFFFFFE00);
-		sc->T1 = SPH_C32(0xFFFFFFFF);
-		memset(u.buf, 0, 56);
-		if (out_size_w32 == 8)
-			u.buf[55] = 1;
-		sph_enc32be_aligned(u.buf + 56, th);
-		sph_enc32be_aligned(u.buf + 60, tl);
-		blake32(sc, u.buf, 64);
-	}
-	out = (unsigned char *)dst;
-	for (k = 0; k < out_size_w32; k ++)
-		sph_enc32be(out + (k << 2), sc->H[k]);
-}
-
-#if SPH_64
-
-static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
-
-static void
-blake64_init(sph_blake_big_context *sc,
-	const sph_u64 *iv, const sph_u64 *salt)
-{
-	memcpy(sc->H, iv, 8 * sizeof(sph_u64));
-	memcpy(sc->S, salt, 4 * sizeof(sph_u64));
-	sc->T0 = sc->T1 = 0;
-	sc->ptr = 0;
-}
-
-static void
-blake64(sph_blake_big_context *sc, const void *data, size_t len)
-{
-	unsigned char *buf;
-	size_t ptr;
-	DECL_STATE64
-
-	buf = sc->buf;
-	ptr = sc->ptr;
-	if (len < (sizeof sc->buf) - ptr) {
-		memcpy(buf + ptr, data, len);
-		ptr += len;
-		sc->ptr = ptr;
-		return;
-	}
-
-	READ_STATE64(sc);
-	while (len > 0) {
-		size_t clen;
-
-		clen = (sizeof sc->buf) - ptr;
-		if (clen > len)
-			clen = len;
-		memcpy(buf + ptr, data, clen);
-		ptr += clen;
-		data = (const unsigned char *)data + clen;
-		len -= clen;
-		if (ptr == sizeof sc->buf) {
-			if ((T0 = SPH_T64(T0 + 1024)) < 1024)
-				T1 = SPH_T64(T1 + 1);
-			COMPRESS64;
-			ptr = 0;
-		}
-	}
-	WRITE_STATE64(sc);
-	sc->ptr = ptr;
-}
-
-static void
-blake64_close(sph_blake_big_context *sc,
-	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
-{
-	union {
-		unsigned char buf[128];
-		sph_u64 dummy;
-	} u;
-	size_t ptr, k;
-	unsigned bit_len;
-	unsigned z;
-	sph_u64 th, tl;
-	unsigned char *out;
-
-	ptr = sc->ptr;
-	bit_len = ((unsigned)ptr << 3) + n;
-	z = 0x80 >> n;
-	u.buf[ptr] = ((ub & -z) | z) & 0xFF;
-	tl = sc->T0 + bit_len;
-	th = sc->T1;
-	if (ptr == 0 && n == 0) {
-		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
-		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
-	} else if (sc->T0 == 0) {
-		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
-		sc->T1 = SPH_T64(sc->T1 - 1);
-	} else {
-		sc->T0 -= 1024 - bit_len;
-	}
-	if (bit_len <= 894) {
-		memset(u.buf + ptr + 1, 0, 111 - ptr);
-		if (out_size_w64 == 8)
-			u.buf[111] |= 1;
-		sph_enc64be_aligned(u.buf + 112, th);
-		sph_enc64be_aligned(u.buf + 120, tl);
-		blake64(sc, u.buf + ptr, 128 - ptr);
-	} else {
-		memset(u.buf + ptr + 1, 0, 127 - ptr);
-		blake64(sc, u.buf + ptr, 128 - ptr);
-		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
-		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
-		memset(u.buf, 0, 112);
-		if (out_size_w64 == 8)
-			u.buf[111] = 1;
-		sph_enc64be_aligned(u.buf + 112, th);
-		sph_enc64be_aligned(u.buf + 120, tl);
-		blake64(sc, u.buf, 128);
-	}
-	out = (unsigned char *)dst;
-	for (k = 0; k < out_size_w64; k ++)
-		sph_enc64be(out + (k << 3), sc->H[k]);
-}
-
-#endif
-
-/* see sph_blake.h */
-void
-sph_blake224_init(void *cc)
-{
-	blake32_init(cc, IV224, salt_zero_small);
-}
-
-/* see sph_blake.h */
-void
-sph_blake224(void *cc, const void *data, size_t len)
-{
-	blake32(cc, data, len);
-}
-
-/* see sph_blake.h */
-void
-sph_blake224_close(void *cc, void *dst)
-{
-	sph_blake224_addbits_and_close(cc, 0, 0, dst);
-}
-
-/* see sph_blake.h */
-void
-sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	blake32_close(cc, ub, n, dst, 7);
-	sph_blake224_init(cc);
-}
-
-/* see sph_blake.h */
-void
-sph_blake256_init(void *cc)
-{
-	blake32_init(cc, IV256, salt_zero_small);
-}
-
-/* see sph_blake.h */
-void
-sph_blake256(void *cc, const void *data, size_t len)
-{
-	blake32(cc, data, len);
-}
-
-/* see sph_blake.h */
-void
-sph_blake256_close(void *cc, void *dst)
-{
-	sph_blake256_addbits_and_close(cc, 0, 0, dst);
-}
-
-/* see sph_blake.h */
-void
-sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	blake32_close(cc, ub, n, dst, 8);
-	sph_blake256_init(cc);
-}
-
-/* see sph_blake.h */
-void
-sph_blake256_set_rounds(int rounds)
-{
-	blake256_rounds = rounds;
-}
-
-#if SPH_64
-
-/* see sph_blake.h */
-void
-sph_blake384_init(void *cc)
-{
-	blake64_init(cc, IV384, salt_zero_big);
-}
-
-/* see sph_blake.h */
-void
-sph_blake384(void *cc, const void *data, size_t len)
-{
-	blake64(cc, data, len);
-}
-
-/* see sph_blake.h */
-void
-sph_blake384_close(void *cc, void *dst)
-{
-	sph_blake384_addbits_and_close(cc, 0, 0, dst);
-}
-
-/* see sph_blake.h */
-void
-sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	blake64_close(cc, ub, n, dst, 6);
-	sph_blake384_init(cc);
-}
-
-/* see sph_blake.h */
-void
-sph_blake512_init(void *cc)
-{
-	blake64_init(cc, IV512, salt_zero_big);
-}
-
-/* see sph_blake.h */
-void
-sph_blake512(void *cc, const void *data, size_t len)
-{
-	blake64(cc, data, len);
-}
-
-/* see sph_blake.h */
-void
-sph_blake512_close(void *cc, void *dst)
-{
-	sph_blake512_addbits_and_close(cc, 0, 0, dst);
-}
-
-/* see sph_blake.h */
-void
-sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	blake64_close(cc, ub, n, dst, 8);
-	sph_blake512_init(cc);
-}
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/sph/sph_blake.h b/sph/sph_blake.h
deleted file mode 100644
index 2c2b3da..0000000
--- a/sph/sph_blake.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
-/**
- * BLAKE interface. BLAKE is a family of functions which differ by their
- * output size; this implementation defines BLAKE for output sizes 224,
- * 256, 384 and 512 bits. This implementation conforms to the "third
- * round" specification.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_blake.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_BLAKE_H__
-#define SPH_BLAKE_H__
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-
-#include <stddef.h>
-#include "sph_types.h"
-
-/**
- * Output size (in bits) for BLAKE-224.
- */
-#define SPH_SIZE_blake224   224
-
-/**
- * Output size (in bits) for BLAKE-256.
- */
-#define SPH_SIZE_blake256   256
-
-#if SPH_64
-
-/**
- * Output size (in bits) for BLAKE-384.
- */
-#define SPH_SIZE_blake384   384
-
-/**
- * Output size (in bits) for BLAKE-512.
- */
-#define SPH_SIZE_blake512   512
-
-#endif
-
-/**
- * This structure is a context for BLAKE-224 and BLAKE-256 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BLAKE computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BLAKE
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[64];    /* first field, for alignment */
-	size_t ptr;
-	sph_u32 H[8];
-	sph_u32 S[4];
-	sph_u32 T0, T1;
-#endif
-} sph_blake_small_context;
-
-/**
- * This structure is a context for BLAKE-224 computations. It is
- * identical to the common <code>sph_blake_small_context</code>.
- */
-typedef sph_blake_small_context sph_blake224_context;
-
-/**
- * This structure is a context for BLAKE-256 computations. It is
- * identical to the common <code>sph_blake_small_context</code>.
- */
-typedef sph_blake_small_context sph_blake256_context;
-
-#if SPH_64
-
-/**
- * This structure is a context for BLAKE-384 and BLAKE-512 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BLAKE computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BLAKE
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[128];    /* first field, for alignment */
-	size_t ptr;
-	sph_u64 H[8];
-	sph_u64 S[4];
-	sph_u64 T0, T1;
-#endif
-} sph_blake_big_context;
-
-/**
- * This structure is a context for BLAKE-384 computations. It is
- * identical to the common <code>sph_blake_small_context</code>.
- */
-typedef sph_blake_big_context sph_blake384_context;
-
-/**
- * This structure is a context for BLAKE-512 computations. It is
- * identical to the common <code>sph_blake_small_context</code>.
- */
-typedef sph_blake_big_context sph_blake512_context;
-
-#endif
-
-/**
- * Initialize a BLAKE-224 context. This process performs no memory allocation.
- *
- * @param cc   the BLAKE-224 context (pointer to a
- *             <code>sph_blake224_context</code>)
- */
-void sph_blake224_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the BLAKE-224 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_blake224(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current BLAKE-224 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (28 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the BLAKE-224 context
- * @param dst   the destination buffer
- */
-void sph_blake224_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (28 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the BLAKE-224 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_blake224_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-/**
- * Switch for the number of rounds (old blake was 8)
- */
-extern int blake256_rounds;
-
-/**
- * Initialize a BLAKE-256 context. This process performs no memory allocation.
- *
- * @param cc   the BLAKE-256 context (pointer to a
- *             <code>sph_blake256_context</code>)
- */
-void sph_blake256_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the BLAKE-256 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_blake256(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current BLAKE-256 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (32 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the BLAKE-256 context
- * @param dst   the destination buffer
- */
-void sph_blake256_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (32 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the BLAKE-256 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_blake256_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-/**
- * Allow blakecoin and blake variants
- */
-void sph_blake256_set_rounds(int rounds);
-
-#if SPH_64
-
-/**
- * Initialize a BLAKE-384 context. This process performs no memory allocation.
- *
- * @param cc   the BLAKE-384 context (pointer to a
- *             <code>sph_blake384_context</code>)
- */
-void sph_blake384_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the BLAKE-384 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_blake384(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current BLAKE-384 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (48 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the BLAKE-384 context
- * @param dst   the destination buffer
- */
-void sph_blake384_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (48 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the BLAKE-384 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_blake384_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-/**
- * Initialize a BLAKE-512 context. This process performs no memory allocation.
- *
- * @param cc   the BLAKE-512 context (pointer to a
- *             <code>sph_blake512_context</code>)
- */
-void sph_blake512_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the BLAKE-512 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_blake512(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current BLAKE-512 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the BLAKE-512 context
- * @param dst   the destination buffer
- */
-void sph_blake512_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (64 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the BLAKE-512 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_blake512_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sph/sph_types.h b/sph/sph_types.h
deleted file mode 100644
index 7295b0b..0000000
--- a/sph/sph_types.h
+++ /dev/null
@@ -1,1976 +0,0 @@
-/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */
-/**
- * Basic type definitions.
- *
- * This header file defines the generic integer types that will be used
- * for the implementation of hash functions; it also contains helper
- * functions which encode and decode multi-byte integer values, using
- * either little-endian or big-endian conventions.
- *
- * This file contains a compile-time test on the size of a byte
- * (the <code>unsigned char</code> C type). If bytes are not octets,
- * i.e. if they do not have a size of exactly 8 bits, then compilation
- * is aborted. Architectures where bytes are not octets are relatively
- * rare, even in the embedded devices market. We forbid non-octet bytes
- * because there is no clear convention on how octet streams are encoded
- * on such systems.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_types.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_TYPES_H__
-#define SPH_TYPES_H__
-
-#include <limits.h>
-
-/*
- * All our I/O functions are defined over octet streams. We do not know
- * how to handle input data if bytes are not octets.
- */
-#if CHAR_BIT != 8
-#error This code requires 8-bit bytes
-#endif
-
-/* ============= BEGIN documentation block for Doxygen ============ */
-
-#ifdef DOXYGEN_IGNORE
-
-/** @mainpage sphlib C code documentation
- *
- * @section overview Overview
- *
- * <code>sphlib</code> is a library which contains implementations of
- * various cryptographic hash functions. These pages have been generated
- * with <a href="http://www.doxygen.org/index.html">doxygen</a> and
- * document the API for the C implementations.
- *
- * The API is described in appropriate header files, which are available
- * in the "Files" section. Each hash function family has its own header,
- * whose name begins with <code>"sph_"</code> and contains the family
- * name. For instance, the API for the RIPEMD hash functions is available
- * in the header file <code>sph_ripemd.h</code>.
- *
- * @section principles API structure and conventions
- *
- * @subsection io Input/output conventions
- *
- * In all generality, hash functions operate over strings of bits.
- * Individual bits are rarely encountered in C programming or actual
- * communication protocols; most protocols converge on the ubiquitous
- * "octet" which is a group of eight bits. Data is thus expressed as a
- * stream of octets. The C programming language contains the notion of a
- * "byte", which is a data unit managed under the type <code>"unsigned
- * char"</code>. The C standard prescribes that a byte should hold at
- * least eight bits, but possibly more. Most modern architectures, even
- * in the embedded world, feature eight-bit bytes, i.e. map bytes to
- * octets.
- *
- * Nevertheless, for some of the implemented hash functions, an extra
- * API has been added, which allows the input of arbitrary sequences of
- * bits: when the computation is about to be closed, 1 to 7 extra bits
- * can be added. The functions for which this API is implemented include
- * the SHA-2 functions and all SHA-3 candidates.
- *
- * <code>sphlib</code> defines hash function which may hash octet streams,
- * i.e. streams of bits where the number of bits is a multiple of eight.
- * The data input functions in the <code>sphlib</code> API expect data
- * as anonymous pointers (<code>"const void *"</code>) with a length
- * (of type <code>"size_t"</code>) which gives the input data chunk length
- * in bytes. A byte is assumed to be an octet; the <code>sph_types.h</code>
- * header contains a compile-time test which prevents compilation on
- * architectures where this property is not met.
- *
- * The hash function output is also converted into bytes. All currently
- * implemented hash functions have an output width which is a multiple of
- * eight, and this is likely to remain true for new designs.
- *
- * Most hash functions internally convert input data into 32-bit of 64-bit
- * words, using either little-endian or big-endian conversion. The hash
- * output also often consists of such words, which are encoded into output
- * bytes with a similar endianness convention. Some hash functions have
- * been only loosely specified on that subject; when necessary,
- * <code>sphlib</code> has been tested against published "reference"
- * implementations in order to use the same conventions.
- *
- * @subsection shortname Function short name
- *
- * Each implemented hash function has a "short name" which is used
- * internally to derive the identifiers for the functions and context
- * structures which the function uses. For instance, MD5 has the short
- * name <code>"md5"</code>. Short names are listed in the next section,
- * for the implemented hash functions. In subsequent sections, the
- * short name will be assumed to be <code>"XXX"</code>: replace with the
- * actual hash function name to get the C identifier.
- *
- * Note: some functions within the same family share the same core
- * elements, such as update function or context structure. Correspondingly,
- * some of the defined types or functions may actually be macros which
- * transparently evaluate to another type or function name.
- *
- * @subsection context Context structure
- *
- * Each implemented hash fonction has its own context structure, available
- * under the type name <code>"sph_XXX_context"</code> for the hash function
- * with short name <code>"XXX"</code>. This structure holds all needed
- * state for a running hash computation.
- *
- * The contents of these structures are meant to be opaque, and private
- * to the implementation. However, these contents are specified in the
- * header files so that application code which uses <code>sphlib</code>
- * may access the size of those structures.
- *
- * The caller is responsible for allocating the context structure,
- * whether by dynamic allocation (<code>malloc()</code> or equivalent),
- * static allocation (a global permanent variable), as an automatic
- * variable ("on the stack"), or by any other mean which ensures proper
- * structure alignment. <code>sphlib</code> code performs no dynamic
- * allocation by itself.
- *
- * The context must be initialized before use, using the
- * <code>sph_XXX_init()</code> function. This function sets the context
- * state to proper initial values for hashing.
- *
- * Since all state data is contained within the context structure,
- * <code>sphlib</code> is thread-safe and reentrant: several hash
- * computations may be performed in parallel, provided that they do not
- * operate on the same context. Moreover, a running computation can be
- * cloned by copying the context (with a simple <code>memcpy()</code>):
- * the context and its clone are then independant and may be updated
- * with new data and/or closed without interfering with each other.
- * Similarly, a context structure can be moved in memory at will:
- * context structures contain no pointer, in particular no pointer to
- * themselves.
- *
- * @subsection dataio Data input
- *
- * Hashed data is input with the <code>sph_XXX()</code> fonction, which
- * takes as parameters a pointer to the context, a pointer to the data
- * to hash, and the number of data bytes to hash. The context is updated
- * with the new data.
- *
- * Data can be input in one or several calls, with arbitrary input lengths.
- * However, it is best, performance wise, to input data by relatively big
- * chunks (say a few kilobytes), because this allows <code>sphlib</code> to
- * optimize things and avoid internal copying.
- *
- * When all data has been input, the context can be closed with
- * <code>sph_XXX_close()</code>. The hash output is computed and written
- * into the provided buffer. The caller must take care to provide a
- * buffer of appropriate length; e.g., when using SHA-1, the output is
- * a 20-byte word, therefore the output buffer must be at least 20-byte
- * long.
- *
- * For some hash functions, the <code>sph_XXX_addbits_and_close()</code>
- * function can be used instead of <code>sph_XXX_close()</code>. This
- * function can take a few extra <strong>bits</strong> to be added at
- * the end of the input message. This allows hashing messages with a
- * bit length which is not a multiple of 8. The extra bits are provided
- * as an unsigned integer value, and a bit count. The bit count must be
- * between 0 and 7, inclusive. The extra bits are provided as bits 7 to
- * 0 (bits of numerical value 128, 64, 32... downto 0), in that order.
- * For instance, to add three bits of value 1, 1 and 0, the unsigned
- * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count
- * will be 3.
- *
- * The <code>SPH_SIZE_XXX</code> macro is defined for each hash function;
- * it evaluates to the function output size, expressed in bits. For instance,
- * <code>SPH_SIZE_sha1</code> evaluates to <code>160</code>.
- *
- * When closed, the context is automatically reinitialized and can be
- * immediately used for another computation. It is not necessary to call
- * <code>sph_XXX_init()</code> after a close. Note that
- * <code>sph_XXX_init()</code> can still be called to "reset" a context,
- * i.e. forget previously input data, and get back to the initial state.
- *
- * @subsection alignment Data alignment
- *
- * "Alignment" is a property of data, which is said to be "properly
- * aligned" when its emplacement in memory is such that the data can
- * be optimally read by full words. This depends on the type of access;
- * basically, some hash functions will read data by 32-bit or 64-bit
- * words. <code>sphlib</code> does not mandate such alignment for input
- * data, but using aligned data can substantially improve performance.
- *
- * As a rule, it is best to input data by chunks whose length (in bytes)
- * is a multiple of eight, and which begins at "generally aligned"
- * addresses, such as the base address returned by a call to
- * <code>malloc()</code>.
- *
- * @section functions Implemented functions
- *
- * We give here the list of implemented functions. They are grouped by
- * family; to each family corresponds a specific header file. Each
- * individual function has its associated "short name". Please refer to
- * the documentation for that header file to get details on the hash
- * function denomination and provenance.
- *
- * Note: the functions marked with a '(64)' in the list below are
- * available only if the C compiler provides an integer type of length
- * 64 bits or more. Such a type is mandatory in the latest C standard
- * (ISO 9899:1999, aka "C99") and is present in several older compilers
- * as well, so chances are that such a type is available.
- *
- * - HAVAL family: file <code>sph_haval.h</code>
- *   - HAVAL-128/3 (128-bit, 3 passes): short name: <code>haval128_3</code>
- *   - HAVAL-128/4 (128-bit, 4 passes): short name: <code>haval128_4</code>
- *   - HAVAL-128/5 (128-bit, 5 passes): short name: <code>haval128_5</code>
- *   - HAVAL-160/3 (160-bit, 3 passes): short name: <code>haval160_3</code>
- *   - HAVAL-160/4 (160-bit, 4 passes): short name: <code>haval160_4</code>
- *   - HAVAL-160/5 (160-bit, 5 passes): short name: <code>haval160_5</code>
- *   - HAVAL-192/3 (192-bit, 3 passes): short name: <code>haval192_3</code>
- *   - HAVAL-192/4 (192-bit, 4 passes): short name: <code>haval192_4</code>
- *   - HAVAL-192/5 (192-bit, 5 passes): short name: <code>haval192_5</code>
- *   - HAVAL-224/3 (224-bit, 3 passes): short name: <code>haval224_3</code>
- *   - HAVAL-224/4 (224-bit, 4 passes): short name: <code>haval224_4</code>
- *   - HAVAL-224/5 (224-bit, 5 passes): short name: <code>haval224_5</code>
- *   - HAVAL-256/3 (256-bit, 3 passes): short name: <code>haval256_3</code>
- *   - HAVAL-256/4 (256-bit, 4 passes): short name: <code>haval256_4</code>
- *   - HAVAL-256/5 (256-bit, 5 passes): short name: <code>haval256_5</code>
- * - MD2: file <code>sph_md2.h</code>, short name: <code>md2</code>
- * - MD4: file <code>sph_md4.h</code>, short name: <code>md4</code>
- * - MD5: file <code>sph_md5.h</code>, short name: <code>md5</code>
- * - PANAMA: file <code>sph_panama.h</code>, short name: <code>panama</code>
- * - RadioGatun family: file <code>sph_radiogatun.h</code>
- *   - RadioGatun[32]: short name: <code>radiogatun32</code>
- *   - RadioGatun[64]: short name: <code>radiogatun64</code> (64)
- * - RIPEMD family: file <code>sph_ripemd.h</code>
- *   - RIPEMD: short name: <code>ripemd</code>
- *   - RIPEMD-128: short name: <code>ripemd128</code>
- *   - RIPEMD-160: short name: <code>ripemd160</code>
- * - SHA-0: file <code>sph_sha0.h</code>, short name: <code>sha0</code>
- * - SHA-1: file <code>sph_sha1.h</code>, short name: <code>sha1</code>
- * - SHA-2 family, 32-bit hashes: file <code>sph_sha2.h</code>
- *   - SHA-224: short name: <code>sha224</code>
- *   - SHA-256: short name: <code>sha256</code>
- *   - SHA-384: short name: <code>sha384</code> (64)
- *   - SHA-512: short name: <code>sha512</code> (64)
- * - Tiger family: file <code>sph_tiger.h</code>
- *   - Tiger: short name: <code>tiger</code> (64)
- *   - Tiger2: short name: <code>tiger2</code> (64)
- * - WHIRLPOOL family: file <code>sph_whirlpool.h</code>
- *   - WHIRLPOOL-0: short name: <code>whirlpool0</code> (64)
- *   - WHIRLPOOL-1: short name: <code>whirlpool1</code> (64)
- *   - WHIRLPOOL: short name: <code>whirlpool</code> (64)
- *
- * The fourteen second-round SHA-3 candidates are also implemented;
- * when applicable, the implementations follow the "final" specifications
- * as published for the third round of the SHA-3 competition (BLAKE,
- * Groestl, JH, Keccak and Skein have been tweaked for third round).
- *
- * - BLAKE family: file <code>sph_blake.h</code>
- *   - BLAKE-224: short name: <code>blake224</code>
- *   - BLAKE-256: short name: <code>blake256</code>
- *   - BLAKE-384: short name: <code>blake384</code>
- *   - BLAKE-512: short name: <code>blake512</code>
- * - BMW (Blue Midnight Wish) family: file <code>sph_bmw.h</code>
- *   - BMW-224: short name: <code>bmw224</code>
- *   - BMW-256: short name: <code>bmw256</code>
- *   - BMW-384: short name: <code>bmw384</code> (64)
- *   - BMW-512: short name: <code>bmw512</code> (64)
- * - CubeHash family: file <code>sph_cubehash.h</code> (specified as
- *   CubeHash16/32 in the CubeHash specification)
- *   - CubeHash-224: short name: <code>cubehash224</code>
- *   - CubeHash-256: short name: <code>cubehash256</code>
- *   - CubeHash-384: short name: <code>cubehash384</code>
- *   - CubeHash-512: short name: <code>cubehash512</code>
- * - ECHO family: file <code>sph_echo.h</code>
- *   - ECHO-224: short name: <code>echo224</code>
- *   - ECHO-256: short name: <code>echo256</code>
- *   - ECHO-384: short name: <code>echo384</code>
- *   - ECHO-512: short name: <code>echo512</code>
- * - Fugue family: file <code>sph_fugue.h</code>
- *   - Fugue-224: short name: <code>fugue224</code>
- *   - Fugue-256: short name: <code>fugue256</code>
- *   - Fugue-384: short name: <code>fugue384</code>
- *   - Fugue-512: short name: <code>fugue512</code>
- * - Groestl family: file <code>sph_groestl.h</code>
- *   - Groestl-224: short name: <code>groestl224</code>
- *   - Groestl-256: short name: <code>groestl256</code>
- *   - Groestl-384: short name: <code>groestl384</code>
- *   - Groestl-512: short name: <code>groestl512</code>
- * - Hamsi family: file <code>sph_hamsi.h</code>
- *   - Hamsi-224: short name: <code>hamsi224</code>
- *   - Hamsi-256: short name: <code>hamsi256</code>
- *   - Hamsi-384: short name: <code>hamsi384</code>
- *   - Hamsi-512: short name: <code>hamsi512</code>
- * - JH family: file <code>sph_jh.h</code>
- *   - JH-224: short name: <code>jh224</code>
- *   - JH-256: short name: <code>jh256</code>
- *   - JH-384: short name: <code>jh384</code>
- *   - JH-512: short name: <code>jh512</code>
- * - Keccak family: file <code>sph_keccak.h</code>
- *   - Keccak-224: short name: <code>keccak224</code>
- *   - Keccak-256: short name: <code>keccak256</code>
- *   - Keccak-384: short name: <code>keccak384</code>
- *   - Keccak-512: short name: <code>keccak512</code>
- * - Luffa family: file <code>sph_luffa.h</code>
- *   - Luffa-224: short name: <code>luffa224</code>
- *   - Luffa-256: short name: <code>luffa256</code>
- *   - Luffa-384: short name: <code>luffa384</code>
- *   - Luffa-512: short name: <code>luffa512</code>
- * - Shabal family: file <code>sph_shabal.h</code>
- *   - Shabal-192: short name: <code>shabal192</code>
- *   - Shabal-224: short name: <code>shabal224</code>
- *   - Shabal-256: short name: <code>shabal256</code>
- *   - Shabal-384: short name: <code>shabal384</code>
- *   - Shabal-512: short name: <code>shabal512</code>
- * - SHAvite-3 family: file <code>sph_shavite.h</code>
- *   - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
- *     short name: <code>shabal224</code>
- *   - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
- *     short name: <code>shabal256</code>
- *   - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
- *     short name: <code>shabal384</code>
- *   - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
- *     short name: <code>shabal512</code>
- * - SIMD family: file <code>sph_simd.h</code>
- *   - SIMD-224: short name: <code>simd224</code>
- *   - SIMD-256: short name: <code>simd256</code>
- *   - SIMD-384: short name: <code>simd384</code>
- *   - SIMD-512: short name: <code>simd512</code>
- * - Skein family: file <code>sph_skein.h</code>
- *   - Skein-224 (nominally specified as Skein-512-224): short name:
- *     <code>skein224</code> (64)
- *   - Skein-256 (nominally specified as Skein-512-256): short name:
- *     <code>skein256</code> (64)
- *   - Skein-384 (nominally specified as Skein-512-384): short name:
- *     <code>skein384</code> (64)
- *   - Skein-512 (nominally specified as Skein-512-512): short name:
- *     <code>skein512</code> (64)
- *
- * For the second-round SHA-3 candidates, the functions are as specified
- * for round 2, i.e. with the "tweaks" that some candidates added
- * between round 1 and round 2. Also, some of the submitted packages for
- * round 2 contained errors, in the specification, reference code, or
- * both. <code>sphlib</code> implements the corrected versions.
- */
-
-/** @hideinitializer
- * Unsigned integer type whose length is at least 32 bits; on most
- * architectures, it will have a width of exactly 32 bits. Unsigned C
- * types implement arithmetics modulo a power of 2; use the
- * <code>SPH_T32()</code> macro to ensure that the value is truncated
- * to exactly 32 bits. Unless otherwise specified, all macros and
- * functions which accept <code>sph_u32</code> values assume that these
- * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures
- * where <code>sph_u32</code> is larger than that.
- */
-typedef __arch_dependant__ sph_u32;
-
-/** @hideinitializer
- * Signed integer type corresponding to <code>sph_u32</code>; it has
- * width 32 bits or more.
- */
-typedef __arch_dependant__ sph_s32;
-
-/** @hideinitializer
- * Unsigned integer type whose length is at least 64 bits; on most
- * architectures which feature such a type, it will have a width of
- * exactly 64 bits. C99-compliant platform will have this type; it
- * is also defined when the GNU compiler (gcc) is used, and on
- * platforms where <code>unsigned long</code> is large enough. If this
- * type is not available, then some hash functions which depends on
- * a 64-bit type will not be available (most notably SHA-384, SHA-512,
- * Tiger and WHIRLPOOL).
- */
-typedef __arch_dependant__ sph_u64;
-
-/** @hideinitializer
- * Signed integer type corresponding to <code>sph_u64</code>; it has
- * width 64 bits or more.
- */
-typedef __arch_dependant__ sph_s64;
-
-/**
- * This macro expands the token <code>x</code> into a suitable
- * constant expression of type <code>sph_u32</code>. Depending on
- * how this type is defined, a suffix such as <code>UL</code> may
- * be appended to the argument.
- *
- * @param x   the token to expand into a suitable constant expression
- */
-#define SPH_C32(x)
-
-/**
- * Truncate a 32-bit value to exactly 32 bits. On most systems, this is
- * a no-op, recognized as such by the compiler.
- *
- * @param x   the value to truncate (of type <code>sph_u32</code>)
- */
-#define SPH_T32(x)
-
-/**
- * Rotate a 32-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 31. This macro assumes that its
- * first argument fits in 32 bits (no extra bit allowed on machines where
- * <code>sph_u32</code> is wider); both arguments may be evaluated
- * several times.
- *
- * @param x   the value to rotate (of type <code>sph_u32</code>)
- * @param n   the rotation count (between 1 and 31, inclusive)
- */
-#define SPH_ROTL32(x, n)
-
-/**
- * Rotate a 32-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 31. This macro assumes that its
- * first argument fits in 32 bits (no extra bit allowed on machines where
- * <code>sph_u32</code> is wider); both arguments may be evaluated
- * several times.
- *
- * @param x   the value to rotate (of type <code>sph_u32</code>)
- * @param n   the rotation count (between 1 and 31, inclusive)
- */
-#define SPH_ROTR32(x, n)
-
-/**
- * This macro is defined on systems for which a 64-bit type has been
- * detected, and is used for <code>sph_u64</code>.
- */
-#define SPH_64
-
-/**
- * This macro is defined on systems for the "native" integer size is
- * 64 bits (64-bit values fit in one register).
- */
-#define SPH_64_TRUE
-
-/**
- * This macro expands the token <code>x</code> into a suitable
- * constant expression of type <code>sph_u64</code>. Depending on
- * how this type is defined, a suffix such as <code>ULL</code> may
- * be appended to the argument. This macro is defined only if a
- * 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param x   the token to expand into a suitable constant expression
- */
-#define SPH_C64(x)
-
-/**
- * Truncate a 64-bit value to exactly 64 bits. On most systems, this is
- * a no-op, recognized as such by the compiler. This macro is defined only
- * if a 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param x   the value to truncate (of type <code>sph_u64</code>)
- */
-#define SPH_T64(x)
-
-/**
- * Rotate a 64-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 63. This macro assumes that its
- * first argument fits in 64 bits (no extra bit allowed on machines where
- * <code>sph_u64</code> is wider); both arguments may be evaluated
- * several times. This macro is defined only if a 64-bit type was detected
- * and used for <code>sph_u64</code>.
- *
- * @param x   the value to rotate (of type <code>sph_u64</code>)
- * @param n   the rotation count (between 1 and 63, inclusive)
- */
-#define SPH_ROTL64(x, n)
-
-/**
- * Rotate a 64-bit value by a number of bits to the left. The rotate
- * count must reside between 1 and 63. This macro assumes that its
- * first argument fits in 64 bits (no extra bit allowed on machines where
- * <code>sph_u64</code> is wider); both arguments may be evaluated
- * several times. This macro is defined only if a 64-bit type was detected
- * and used for <code>sph_u64</code>.
- *
- * @param x   the value to rotate (of type <code>sph_u64</code>)
- * @param n   the rotation count (between 1 and 63, inclusive)
- */
-#define SPH_ROTR64(x, n)
-
-/**
- * This macro evaluates to <code>inline</code> or an equivalent construction,
- * if available on the compilation platform, or to nothing otherwise. This
- * is used to declare inline functions, for which the compiler should
- * endeavour to include the code directly in the caller. Inline functions
- * are typically defined in header files as replacement for macros.
- */
-#define SPH_INLINE
-
-/**
- * This macro is defined if the platform has been detected as using
- * little-endian convention. This implies that the <code>sph_u32</code>
- * type (and the <code>sph_u64</code> type also, if it is defined) has
- * an exact width (i.e. exactly 32-bit, respectively 64-bit).
- */
-#define SPH_LITTLE_ENDIAN
-
-/**
- * This macro is defined if the platform has been detected as using
- * big-endian convention. This implies that the <code>sph_u32</code>
- * type (and the <code>sph_u64</code> type also, if it is defined) has
- * an exact width (i.e. exactly 32-bit, respectively 64-bit).
- */
-#define SPH_BIG_ENDIAN
-
-/**
- * This macro is defined if 32-bit words (and 64-bit words, if defined)
- * can be read from and written to memory efficiently in little-endian
- * convention. This is the case for little-endian platforms, and also
- * for the big-endian platforms which have special little-endian access
- * opcodes (e.g. Ultrasparc).
- */
-#define SPH_LITTLE_FAST
-
-/**
- * This macro is defined if 32-bit words (and 64-bit words, if defined)
- * can be read from and written to memory efficiently in big-endian
- * convention. This is the case for little-endian platforms, and also
- * for the little-endian platforms which have special big-endian access
- * opcodes.
- */
-#define SPH_BIG_FAST
-
-/**
- * On some platforms, this macro is defined to an unsigned integer type
- * into which pointer values may be cast. The resulting value can then
- * be tested for being a multiple of 2, 4 or 8, indicating an aligned
- * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
- */
-#define SPH_UPTR
-
-/**
- * When defined, this macro indicates that unaligned memory accesses
- * are possible with only a minor penalty, and thus should be prefered
- * over strategies which first copy data to an aligned buffer.
- */
-#define SPH_UNALIGNED
-
-/**
- * Byte-swap a 32-bit word (i.e. <code>0x12345678</code> becomes
- * <code>0x78563412</code>). This is an inline function which resorts
- * to inline assembly on some platforms, for better performance.
- *
- * @param x   the 32-bit value to byte-swap
- * @return  the byte-swapped value
- */
-static inline sph_u32 sph_bswap32(sph_u32 x);
-
-/**
- * Byte-swap a 64-bit word. This is an inline function which resorts
- * to inline assembly on some platforms, for better performance. This
- * function is defined only if a suitable 64-bit type was found for
- * <code>sph_u64</code>
- *
- * @param x   the 64-bit value to byte-swap
- * @return  the byte-swapped value
- */
-static inline sph_u64 sph_bswap64(sph_u64 x);
-
-/**
- * Decode a 16-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline unsigned sph_dec16le(const void *src);
-
-/**
- * Encode a 16-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc16le(void *dst, unsigned val);
-
-/**
- * Decode a 16-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline unsigned sph_dec16be(const void *src);
-
-/**
- * Encode a 16-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc16be(void *dst, unsigned val);
-
-/**
- * Decode a 32-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32le(const void *src);
-
-/**
- * Decode a 32-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec32le()</code> function.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32le_aligned(const void *src);
-
-/**
- * Encode a 32-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32le(void *dst, sph_u32 val);
-
-/**
- * Encode a 32-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc32le()</code> function.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32le_aligned(void *dst, sph_u32 val);
-
-/**
- * Decode a 32-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32be(const void *src);
-
-/**
- * Decode a 32-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec32be()</code> function.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u32 sph_dec32be_aligned(const void *src);
-
-/**
- * Encode a 32-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first).
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32be(void *dst, sph_u32 val);
-
-/**
- * Encode a 32-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc32be()</code> function.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc32be_aligned(void *dst, sph_u32 val);
-
-/**
- * Decode a 64-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64le(const void *src);
-
-/**
- * Decode a 64-bit unsigned value from memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec64le()</code> function. This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64le_aligned(const void *src);
-
-/**
- * Encode a 64-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64le(void *dst, sph_u64 val);
-
-/**
- * Encode a 64-bit unsigned value into memory, in little-endian convention
- * (least significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc64le()</code> function. This function is defined
- * only if a suitable 64-bit type was detected and used for
- * <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64le_aligned(void *dst, sph_u64 val);
-
-/**
- * Decode a 64-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64be(const void *src);
-
-/**
- * Decode a 64-bit unsigned value from memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * <code>sph_dec64be()</code> function. This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param src   the source address
- * @return  the decoded value
- */
-static inline sph_u64 sph_dec64be_aligned(const void *src);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64be(void *dst, sph_u64 val);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic <code>sph_enc64be()</code> function. This function is defined
- * only if a suitable 64-bit type was detected and used for
- * <code>sph_u64</code>.
- *
- * @param dst   the destination buffer
- * @param val   the value to encode
- */
-static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
-
-#endif
-
-/* ============== END documentation block for Doxygen ============= */
-
-#ifndef DOXYGEN_IGNORE
-
-/*
- * We want to define the types "sph_u32" and "sph_u64" which hold
- * unsigned values of at least, respectively, 32 and 64 bits. These
- * tests should select appropriate types for most platforms. The
- * macro "SPH_64" is defined if the 64-bit is supported.
- */
-
-#undef SPH_64
-#undef SPH_64_TRUE
-
-#if defined __STDC__ && __STDC_VERSION__ >= 199901L
-
-/*
- * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
- * type, if any, or otherwise use a wider type (which must exist, for
- * C99 conformance).
- */
-
-#include <stdint.h>
-
-#ifdef UINT32_MAX
-typedef uint32_t sph_u32;
-typedef int32_t sph_s32;
-#else
-typedef uint_fast32_t sph_u32;
-typedef int_fast32_t sph_s32;
-#endif
-#if !SPH_NO_64
-#ifdef UINT64_MAX
-typedef uint64_t sph_u64;
-typedef int64_t sph_s64;
-#else
-typedef uint_fast64_t sph_u64;
-typedef int_fast64_t sph_s64;
-#endif
-#endif
-
-#define SPH_C32(x)    ((sph_u32)(x))
-#if !SPH_NO_64
-#define SPH_C64(x)    ((sph_u64)(x))
-#define SPH_64  1
-#endif
-
-#else
-
-/*
- * On non-C99 systems, we use "unsigned int" if it is wide enough,
- * "unsigned long" otherwise. This supports all "reasonable" architectures.
- * We have to be cautious: pre-C99 preprocessors handle constants
- * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
- */
-
-#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
-
-typedef unsigned int sph_u32;
-typedef int sph_s32;
-
-#define SPH_C32(x)    ((sph_u32)(x ## U))
-
-#else
-
-typedef unsigned long sph_u32;
-typedef long sph_s32;
-
-#define SPH_C32(x)    ((sph_u32)(x ## UL))
-
-#endif
-
-#if !SPH_NO_64
-
-/*
- * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
- * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
- * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
- * test whether "unsigned long long" is available; we also know that
- * gcc features this type, even if the libc header do not know it.
- */
-
-#if ((ULONG_MAX >> 31) >> 31) >= 3
-
-typedef unsigned long sph_u64;
-typedef long sph_s64;
-
-#define SPH_C64(x)    ((sph_u64)(x ## UL))
-
-#define SPH_64  1
-
-#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
-
-typedef unsigned long long sph_u64;
-typedef long long sph_s64;
-
-#define SPH_C64(x)    ((sph_u64)(x ## ULL))
-
-#define SPH_64  1
-
-#else
-
-/*
- * No 64-bit type...
- */
-
-#endif
-
-#endif
-
-#endif
-
-/*
- * If the "unsigned long" type has length 64 bits or more, then this is
- * a "true" 64-bit architectures. This is also true with Visual C on
- * amd64, even though the "long" type is limited to 32 bits.
- */
-#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64)
-#define SPH_64_TRUE   1
-#endif
-
-/*
- * Implementation note: some processors have specific opcodes to perform
- * a rotation. Recent versions of gcc recognize the expression above and
- * use the relevant opcodes, when appropriate.
- */
-
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
-
-#if SPH_64
-
-#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
-#define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))
-
-#endif
-
-#ifndef DOXYGEN_IGNORE
-/*
- * Define SPH_INLINE to be an "inline" qualifier, if available. We define
- * some small macro-like functions which benefit greatly from being inlined.
- */
-#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__
-#define SPH_INLINE inline
-#elif defined _MSC_VER
-#define SPH_INLINE __inline
-#else
-#define SPH_INLINE
-#endif
-#endif
-
-/*
- * We define some macros which qualify the architecture. These macros
- * may be explicit set externally (e.g. as compiler parameters). The
- * code below sets those macros if they are not already defined.
- *
- * Most macros are boolean, thus evaluate to either zero or non-zero.
- * The SPH_UPTR macro is special, in that it evaluates to a C type,
- * or is not defined.
- *
- * SPH_UPTR             if defined: unsigned type to cast pointers into
- *
- * SPH_UNALIGNED        non-zero if unaligned accesses are efficient
- * SPH_LITTLE_ENDIAN    non-zero if architecture is known to be little-endian
- * SPH_BIG_ENDIAN       non-zero if architecture is known to be big-endian
- * SPH_LITTLE_FAST      non-zero if little-endian decoding is fast
- * SPH_BIG_FAST         non-zero if big-endian decoding is fast
- *
- * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
- * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
- * _must_ be non-zero in those situations. The 32-bit and 64-bit types
- * _must_ also have an exact width.
- *
- * SPH_SPARCV9_GCC_32   UltraSPARC-compatible with gcc, 32-bit mode
- * SPH_SPARCV9_GCC_64   UltraSPARC-compatible with gcc, 64-bit mode
- * SPH_SPARCV9_GCC      UltraSPARC-compatible with gcc
- * SPH_I386_GCC         x86-compatible (32-bit) with gcc
- * SPH_I386_MSVC        x86-compatible (32-bit) with Microsoft Visual C
- * SPH_AMD64_GCC        x86-compatible (64-bit) with gcc
- * SPH_AMD64_MSVC       x86-compatible (64-bit) with Microsoft Visual C
- * SPH_PPC32_GCC        PowerPC, 32-bit, with gcc
- * SPH_PPC64_GCC        PowerPC, 64-bit, with gcc
- *
- * TODO: enhance automatic detection, for more architectures and compilers.
- * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
- * some very fast functions (e.g. MD4) when using unaligned input data.
- * The CPU-specific-with-GCC macros are useful only for inline assembly,
- * normally restrained to this header file.
- */
-
-/*
- * 32-bit x86, aka "i386 compatible".
- */
-#if defined __i386__ || defined _M_IX86
-
-#define SPH_DETECT_UNALIGNED         1
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#define SPH_DETECT_UPTR              sph_u32
-#ifdef __GNUC__
-#define SPH_DETECT_I386_GCC          1
-#endif
-#ifdef _MSC_VER
-#define SPH_DETECT_I386_MSVC         1
-#endif
-
-/*
- * 64-bit x86, hereafter known as "amd64".
- */
-#elif defined __x86_64 || defined _M_X64
-
-#define SPH_DETECT_UNALIGNED         1
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#define SPH_DETECT_UPTR              sph_u64
-#ifdef __GNUC__
-#define SPH_DETECT_AMD64_GCC         1
-#endif
-#ifdef _MSC_VER
-#define SPH_DETECT_AMD64_MSVC        1
-#endif
-
-/*
- * 64-bit Sparc architecture (implies v9).
- */
-#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \
-	|| defined __sparcv9
-
-#define SPH_DETECT_BIG_ENDIAN        1
-#define SPH_DETECT_UPTR              sph_u64
-#ifdef __GNUC__
-#define SPH_DETECT_SPARCV9_GCC_64    1
-#define SPH_DETECT_LITTLE_FAST       1
-#endif
-
-/*
- * 32-bit Sparc.
- */
-#elif (defined __sparc__ || defined __sparc) \
-	&& !(defined __sparcv9 || defined __arch64__)
-
-#define SPH_DETECT_BIG_ENDIAN        1
-#define SPH_DETECT_UPTR              sph_u32
-#if defined __GNUC__ && defined __sparc_v9__
-#define SPH_DETECT_SPARCV9_GCC_32    1
-#define SPH_DETECT_LITTLE_FAST       1
-#endif
-
-/*
- * ARM, little-endian.
- */
-#elif defined __arm__ && __ARMEL__
-
-#define SPH_DETECT_LITTLE_ENDIAN     1
-
-/*
- * MIPS, little-endian.
- */
-#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__
-
-#define SPH_DETECT_LITTLE_ENDIAN     1
-
-/*
- * MIPS, big-endian.
- */
-#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__
-
-#define SPH_DETECT_BIG_ENDIAN        1
-
-/*
- * PowerPC.
- */
-#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \
-	|| defined _ARCH_PPC
-
-/*
- * Note: we do not declare cross-endian access to be "fast": even if
- * using inline assembly, implementation should still assume that
- * keeping the decoded word in a temporary is faster than decoding
- * it again.
- */
-#if defined __GNUC__
-#if SPH_64_TRUE
-#define SPH_DETECT_PPC64_GCC         1
-#else
-#define SPH_DETECT_PPC32_GCC         1
-#endif
-#endif
-
-#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
-#define SPH_DETECT_BIG_ENDIAN        1
-#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#endif
-
-/*
- * Itanium, 64-bit.
- */
-#elif defined __ia64 || defined __ia64__ \
-	|| defined __itanium__ || defined _M_IA64
-
-#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
-#define SPH_DETECT_BIG_ENDIAN        1
-#else
-#define SPH_DETECT_LITTLE_ENDIAN     1
-#endif
-#if defined __LP64__ || defined _LP64
-#define SPH_DETECT_UPTR              sph_u64
-#else
-#define SPH_DETECT_UPTR              sph_u32
-#endif
-
-#endif
-
-#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64
-#define SPH_DETECT_SPARCV9_GCC       1
-#endif
-
-#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED
-#define SPH_UNALIGNED         SPH_DETECT_UNALIGNED
-#endif
-#if defined SPH_DETECT_UPTR && !defined SPH_UPTR
-#define SPH_UPTR              SPH_DETECT_UPTR
-#endif
-#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN
-#define SPH_LITTLE_ENDIAN     SPH_DETECT_LITTLE_ENDIAN
-#endif
-#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN
-#define SPH_BIG_ENDIAN        SPH_DETECT_BIG_ENDIAN
-#endif
-#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST
-#define SPH_LITTLE_FAST       SPH_DETECT_LITTLE_FAST
-#endif
-#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST
-#define SPH_BIG_FAST    SPH_DETECT_BIG_FAST
-#endif
-#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32
-#define SPH_SPARCV9_GCC_32    SPH_DETECT_SPARCV9_GCC_32
-#endif
-#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64
-#define SPH_SPARCV9_GCC_64    SPH_DETECT_SPARCV9_GCC_64
-#endif
-#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC
-#define SPH_SPARCV9_GCC       SPH_DETECT_SPARCV9_GCC
-#endif
-#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC
-#define SPH_I386_GCC          SPH_DETECT_I386_GCC
-#endif
-#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC
-#define SPH_I386_MSVC         SPH_DETECT_I386_MSVC
-#endif
-#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC
-#define SPH_AMD64_GCC         SPH_DETECT_AMD64_GCC
-#endif
-#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC
-#define SPH_AMD64_MSVC        SPH_DETECT_AMD64_MSVC
-#endif
-#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC
-#define SPH_PPC32_GCC         SPH_DETECT_PPC32_GCC
-#endif
-#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC
-#define SPH_PPC64_GCC         SPH_DETECT_PPC64_GCC
-#endif
-
-#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST
-#define SPH_LITTLE_FAST              1
-#endif
-#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST
-#define SPH_BIG_FAST                 1
-#endif
-
-#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN)
-#error SPH_UPTR defined, but endianness is not known.
-#endif
-
-#if SPH_I386_GCC && !SPH_NO_ASM
-
-/*
- * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
- * values.
- */
-
-static SPH_INLINE sph_u32
-sph_bswap32(sph_u32 x)
-{
-	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
-	return x;
-}
-
-#if SPH_64
-
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
-		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
-}
-
-#endif
-
-#elif SPH_AMD64_GCC && !SPH_NO_ASM
-
-/*
- * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
- * and 64-bit values.
- */
-
-static SPH_INLINE sph_u32
-sph_bswap32(sph_u32 x)
-{
-	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
-	return x;
-}
-
-#if SPH_64
-
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-	__asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x));
-	return x;
-}
-
-#endif
-
-/*
- * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough
- * to generate proper opcodes for endianness swapping with the pure C
- * implementation below.
- *
-
-#elif SPH_I386_MSVC && !SPH_NO_ASM
-
-static __inline sph_u32 __declspec(naked) __fastcall
-sph_bswap32(sph_u32 x)
-{
-	__asm {
-		bswap  ecx
-		mov    eax,ecx
-		ret
-	}
-}
-
-#if SPH_64
-
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
-		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
-}
-
-#endif
-
- *
- * [end of disabled code]
- */
-
-#else
-
-static SPH_INLINE sph_u32
-sph_bswap32(sph_u32 x)
-{
-	x = SPH_T32((x << 16) | (x >> 16));
-	x = ((x & SPH_C32(0xFF00FF00)) >> 8)
-		| ((x & SPH_C32(0x00FF00FF)) << 8);
-	return x;
-}
-
-#if SPH_64
-
-/**
- * Byte-swap a 64-bit value.
- *
- * @param x   the input value
- * @return  the byte-swapped value
- */
-static SPH_INLINE sph_u64
-sph_bswap64(sph_u64 x)
-{
-	x = SPH_T64((x << 32) | (x >> 32));
-	x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16)
-		| ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16);
-	x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8)
-		| ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8);
-	return x;
-}
-
-#endif
-
-#endif
-
-#if SPH_SPARCV9_GCC && !SPH_NO_ASM
-
-/*
- * On UltraSPARC systems, native ordering is big-endian, but it is
- * possible to perform little-endian read accesses by specifying the
- * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use
- * the opcode "lda [%reg]0x88,%dst", where %reg is the register which
- * contains the source address and %dst is the destination register,
- * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register
- * to get the address space name. The latter format is better since it
- * combines an addition and the actual access in a single opcode; but
- * it requires the setting (and subsequent resetting) of %asi, which is
- * slow. Some operations (i.e. MD5 compression function) combine many
- * successive little-endian read accesses, which may share the same
- * %asi setting. The macros below contain the appropriate inline
- * assembly.
- */
-
-#define SPH_SPARCV9_SET_ASI   \
-	sph_u32 sph_sparcv9_asi; \
-	__asm__ __volatile__ ( \
-		"rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi));
-
-#define SPH_SPARCV9_RESET_ASI  \
-	__asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi));
-
-#define SPH_SPARCV9_DEC32LE(base, idx)   ({ \
-		sph_u32 sph_sparcv9_tmp; \
-		__asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \
-			: "=r" (sph_sparcv9_tmp) : "r" (base)); \
-		sph_sparcv9_tmp; \
-	})
-
-#endif
-
-static SPH_INLINE void
-sph_enc16be(void *dst, unsigned val)
-{
-	((unsigned char *)dst)[0] = (val >> 8);
-	((unsigned char *)dst)[1] = val;
-}
-
-static SPH_INLINE unsigned
-sph_dec16be(const void *src)
-{
-	return ((unsigned)(((const unsigned char *)src)[0]) << 8)
-		| (unsigned)(((const unsigned char *)src)[1]);
-}
-
-static SPH_INLINE void
-sph_enc16le(void *dst, unsigned val)
-{
-	((unsigned char *)dst)[0] = val;
-	((unsigned char *)dst)[1] = val >> 8;
-}
-
-static SPH_INLINE unsigned
-sph_dec16le(const void *src)
-{
-	return (unsigned)(((const unsigned char *)src)[0])
-		| ((unsigned)(((const unsigned char *)src)[1]) << 8);
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (big endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 32-bit value to encode
- */
-static SPH_INLINE void
-sph_enc32be(void *dst, sph_u32 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-	val = sph_bswap32(val);
-#endif
-	*(sph_u32 *)dst = val;
-#else
-	if (((SPH_UPTR)dst & 3) == 0) {
-#if SPH_LITTLE_ENDIAN
-		val = sph_bswap32(val);
-#endif
-		*(sph_u32 *)dst = val;
-	} else {
-		((unsigned char *)dst)[0] = (val >> 24);
-		((unsigned char *)dst)[1] = (val >> 16);
-		((unsigned char *)dst)[2] = (val >> 8);
-		((unsigned char *)dst)[3] = val;
-	}
-#endif
-#else
-	((unsigned char *)dst)[0] = (val >> 24);
-	((unsigned char *)dst)[1] = (val >> 16);
-	((unsigned char *)dst)[2] = (val >> 8);
-	((unsigned char *)dst)[3] = val;
-#endif
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (big endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (32-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc32be_aligned(void *dst, sph_u32 val)
-{
-#if SPH_LITTLE_ENDIAN
-	*(sph_u32 *)dst = sph_bswap32(val);
-#elif SPH_BIG_ENDIAN
-	*(sph_u32 *)dst = val;
-#else
-	((unsigned char *)dst)[0] = (val >> 24);
-	((unsigned char *)dst)[1] = (val >> 16);
-	((unsigned char *)dst)[2] = (val >> 8);
-	((unsigned char *)dst)[3] = val;
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (big endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32be(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-	return sph_bswap32(*(const sph_u32 *)src);
-#else
-	return *(const sph_u32 *)src;
-#endif
-#else
-	if (((SPH_UPTR)src & 3) == 0) {
-#if SPH_LITTLE_ENDIAN
-		return sph_bswap32(*(const sph_u32 *)src);
-#else
-		return *(const sph_u32 *)src;
-#endif
-	} else {
-		return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
-			| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
-			| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
-			| (sph_u32)(((const unsigned char *)src)[3]);
-	}
-#endif
-#else
-	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
-		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
-		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
-		| (sph_u32)(((const unsigned char *)src)[3]);
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (big endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (32-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32be_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-	return sph_bswap32(*(const sph_u32 *)src);
-#elif SPH_BIG_ENDIAN
-	return *(const sph_u32 *)src;
-#else
-	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
-		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
-		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
-		| (sph_u32)(((const unsigned char *)src)[3]);
-#endif
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (little endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 32-bit value to encode
- */
-static SPH_INLINE void
-sph_enc32le(void *dst, sph_u32 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-	val = sph_bswap32(val);
-#endif
-	*(sph_u32 *)dst = val;
-#else
-	if (((SPH_UPTR)dst & 3) == 0) {
-#if SPH_BIG_ENDIAN
-		val = sph_bswap32(val);
-#endif
-		*(sph_u32 *)dst = val;
-	} else {
-		((unsigned char *)dst)[0] = val;
-		((unsigned char *)dst)[1] = (val >> 8);
-		((unsigned char *)dst)[2] = (val >> 16);
-		((unsigned char *)dst)[3] = (val >> 24);
-	}
-#endif
-#else
-	((unsigned char *)dst)[0] = val;
-	((unsigned char *)dst)[1] = (val >> 8);
-	((unsigned char *)dst)[2] = (val >> 16);
-	((unsigned char *)dst)[3] = (val >> 24);
-#endif
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (little endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (32-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc32le_aligned(void *dst, sph_u32 val)
-{
-#if SPH_LITTLE_ENDIAN
-	*(sph_u32 *)dst = val;
-#elif SPH_BIG_ENDIAN
-	*(sph_u32 *)dst = sph_bswap32(val);
-#else
-	((unsigned char *)dst)[0] = val;
-	((unsigned char *)dst)[1] = (val >> 8);
-	((unsigned char *)dst)[2] = (val >> 16);
-	((unsigned char *)dst)[3] = (val >> 24);
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (little endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32le(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-	return sph_bswap32(*(const sph_u32 *)src);
-#else
-	return *(const sph_u32 *)src;
-#endif
-#else
-	if (((SPH_UPTR)src & 3) == 0) {
-#if SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC && !SPH_NO_ASM
-		sph_u32 tmp;
-
-		/*
-		 * "__volatile__" is needed here because without it,
-		 * gcc-3.4.3 miscompiles the code and performs the
-		 * access before the test on the address, thus triggering
-		 * a bus error...
-		 */
-		__asm__ __volatile__ (
-			"lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-		return tmp;
-/*
- * On PowerPC, this turns out not to be worth the effort: the inline
- * assembly makes GCC optimizer uncomfortable, which tends to nullify
- * the decoding gains.
- *
- * For most hash functions, using this inline assembly trick changes
- * hashing speed by less than 5% and often _reduces_ it. The biggest
- * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
- * less then 10%. The speed gain on CubeHash is probably due to the
- * chronic shortage of registers that CubeHash endures; for the other
- * functions, the generic code appears to be efficient enough already.
- *
-#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
-		sph_u32 tmp;
-
-		__asm__ __volatile__ (
-			"lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-		return tmp;
- */
-#else
-		return sph_bswap32(*(const sph_u32 *)src);
-#endif
-#else
-		return *(const sph_u32 *)src;
-#endif
-	} else {
-		return (sph_u32)(((const unsigned char *)src)[0])
-			| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
-			| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
-			| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
-	}
-#endif
-#else
-	return (sph_u32)(((const unsigned char *)src)[0])
-		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
-		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
-		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
-#endif
-}
-
-/**
- * Decode a 32-bit value from the provided buffer (little endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (32-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u32
-sph_dec32le_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-	return *(const sph_u32 *)src;
-#elif SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC && !SPH_NO_ASM
-	sph_u32 tmp;
-
-	__asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-	return tmp;
-/*
- * Not worth it generally.
- *
-#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
-	sph_u32 tmp;
-
-	__asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-	return tmp;
- */
-#else
-	return sph_bswap32(*(const sph_u32 *)src);
-#endif
-#else
-	return (sph_u32)(((const unsigned char *)src)[0])
-		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
-		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
-		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
-#endif
-}
-
-#if SPH_64
-
-/**
- * Encode a 64-bit value into the provided buffer (big endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 64-bit value to encode
- */
-static SPH_INLINE void
-sph_enc64be(void *dst, sph_u64 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-	val = sph_bswap64(val);
-#endif
-	*(sph_u64 *)dst = val;
-#else
-	if (((SPH_UPTR)dst & 7) == 0) {
-#if SPH_LITTLE_ENDIAN
-		val = sph_bswap64(val);
-#endif
-		*(sph_u64 *)dst = val;
-	} else {
-		((unsigned char *)dst)[0] = (val >> 56);
-		((unsigned char *)dst)[1] = (val >> 48);
-		((unsigned char *)dst)[2] = (val >> 40);
-		((unsigned char *)dst)[3] = (val >> 32);
-		((unsigned char *)dst)[4] = (val >> 24);
-		((unsigned char *)dst)[5] = (val >> 16);
-		((unsigned char *)dst)[6] = (val >> 8);
-		((unsigned char *)dst)[7] = val;
-	}
-#endif
-#else
-	((unsigned char *)dst)[0] = (val >> 56);
-	((unsigned char *)dst)[1] = (val >> 48);
-	((unsigned char *)dst)[2] = (val >> 40);
-	((unsigned char *)dst)[3] = (val >> 32);
-	((unsigned char *)dst)[4] = (val >> 24);
-	((unsigned char *)dst)[5] = (val >> 16);
-	((unsigned char *)dst)[6] = (val >> 8);
-	((unsigned char *)dst)[7] = val;
-#endif
-}
-
-/**
- * Encode a 64-bit value into the provided buffer (big endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (64-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc64be_aligned(void *dst, sph_u64 val)
-{
-#if SPH_LITTLE_ENDIAN
-	*(sph_u64 *)dst = sph_bswap64(val);
-#elif SPH_BIG_ENDIAN
-	*(sph_u64 *)dst = val;
-#else
-	((unsigned char *)dst)[0] = (val >> 56);
-	((unsigned char *)dst)[1] = (val >> 48);
-	((unsigned char *)dst)[2] = (val >> 40);
-	((unsigned char *)dst)[3] = (val >> 32);
-	((unsigned char *)dst)[4] = (val >> 24);
-	((unsigned char *)dst)[5] = (val >> 16);
-	((unsigned char *)dst)[6] = (val >> 8);
-	((unsigned char *)dst)[7] = val;
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (big endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64be(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_LITTLE_ENDIAN
-	return sph_bswap64(*(const sph_u64 *)src);
-#else
-	return *(const sph_u64 *)src;
-#endif
-#else
-	if (((SPH_UPTR)src & 7) == 0) {
-#if SPH_LITTLE_ENDIAN
-		return sph_bswap64(*(const sph_u64 *)src);
-#else
-		return *(const sph_u64 *)src;
-#endif
-	} else {
-		return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
-			| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
-			| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
-			| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
-			| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
-			| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
-			| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
-			| (sph_u64)(((const unsigned char *)src)[7]);
-	}
-#endif
-#else
-	return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
-		| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
-		| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
-		| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
-		| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
-		| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
-		| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
-		| (sph_u64)(((const unsigned char *)src)[7]);
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (big endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (64-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64be_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-	return sph_bswap64(*(const sph_u64 *)src);
-#elif SPH_BIG_ENDIAN
-	return *(const sph_u64 *)src;
-#else
-	return ((sph_u64)(((const unsigned char *)src)[0]) << 56)
-		| ((sph_u64)(((const unsigned char *)src)[1]) << 48)
-		| ((sph_u64)(((const unsigned char *)src)[2]) << 40)
-		| ((sph_u64)(((const unsigned char *)src)[3]) << 32)
-		| ((sph_u64)(((const unsigned char *)src)[4]) << 24)
-		| ((sph_u64)(((const unsigned char *)src)[5]) << 16)
-		| ((sph_u64)(((const unsigned char *)src)[6]) << 8)
-		| (sph_u64)(((const unsigned char *)src)[7]);
-#endif
-}
-
-/**
- * Encode a 64-bit value into the provided buffer (little endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 64-bit value to encode
- */
-static SPH_INLINE void
-sph_enc64le(void *dst, sph_u64 val)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-	val = sph_bswap64(val);
-#endif
-	*(sph_u64 *)dst = val;
-#else
-	if (((SPH_UPTR)dst & 7) == 0) {
-#if SPH_BIG_ENDIAN
-		val = sph_bswap64(val);
-#endif
-		*(sph_u64 *)dst = val;
-	} else {
-		((unsigned char *)dst)[0] = val;
-		((unsigned char *)dst)[1] = (val >> 8);
-		((unsigned char *)dst)[2] = (val >> 16);
-		((unsigned char *)dst)[3] = (val >> 24);
-		((unsigned char *)dst)[4] = (val >> 32);
-		((unsigned char *)dst)[5] = (val >> 40);
-		((unsigned char *)dst)[6] = (val >> 48);
-		((unsigned char *)dst)[7] = (val >> 56);
-	}
-#endif
-#else
-	((unsigned char *)dst)[0] = val;
-	((unsigned char *)dst)[1] = (val >> 8);
-	((unsigned char *)dst)[2] = (val >> 16);
-	((unsigned char *)dst)[3] = (val >> 24);
-	((unsigned char *)dst)[4] = (val >> 32);
-	((unsigned char *)dst)[5] = (val >> 40);
-	((unsigned char *)dst)[6] = (val >> 48);
-	((unsigned char *)dst)[7] = (val >> 56);
-#endif
-}
-
-/**
- * Encode a 64-bit value into the provided buffer (little endian convention).
- * The destination buffer must be properly aligned.
- *
- * @param dst   the destination buffer (64-bit aligned)
- * @param val   the value to encode
- */
-static SPH_INLINE void
-sph_enc64le_aligned(void *dst, sph_u64 val)
-{
-#if SPH_LITTLE_ENDIAN
-	*(sph_u64 *)dst = val;
-#elif SPH_BIG_ENDIAN
-	*(sph_u64 *)dst = sph_bswap64(val);
-#else
-	((unsigned char *)dst)[0] = val;
-	((unsigned char *)dst)[1] = (val >> 8);
-	((unsigned char *)dst)[2] = (val >> 16);
-	((unsigned char *)dst)[3] = (val >> 24);
-	((unsigned char *)dst)[4] = (val >> 32);
-	((unsigned char *)dst)[5] = (val >> 40);
-	((unsigned char *)dst)[6] = (val >> 48);
-	((unsigned char *)dst)[7] = (val >> 56);
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (little endian convention).
- *
- * @param src   the source buffer
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64le(const void *src)
-{
-#if defined SPH_UPTR
-#if SPH_UNALIGNED
-#if SPH_BIG_ENDIAN
-	return sph_bswap64(*(const sph_u64 *)src);
-#else
-	return *(const sph_u64 *)src;
-#endif
-#else
-	if (((SPH_UPTR)src & 7) == 0) {
-#if SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
-		sph_u64 tmp;
-
-		__asm__ __volatile__ (
-			"ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-		return tmp;
-/*
- * Not worth it generally.
- *
-#elif SPH_PPC32_GCC && !SPH_NO_ASM
-		return (sph_u64)sph_dec32le_aligned(src)
-			| ((sph_u64)sph_dec32le_aligned(
-				(const char *)src + 4) << 32);
-#elif SPH_PPC64_GCC && !SPH_NO_ASM
-		sph_u64 tmp;
-
-		__asm__ __volatile__ (
-			"ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-		return tmp;
- */
-#else
-		return sph_bswap64(*(const sph_u64 *)src);
-#endif
-#else
-		return *(const sph_u64 *)src;
-#endif
-	} else {
-		return (sph_u64)(((const unsigned char *)src)[0])
-			| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
-			| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
-			| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
-			| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
-			| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
-			| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
-			| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
-	}
-#endif
-#else
-	return (sph_u64)(((const unsigned char *)src)[0])
-		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
-		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
-		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
-		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
-		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
-		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
-		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
-#endif
-}
-
-/**
- * Decode a 64-bit value from the provided buffer (little endian convention).
- * The source buffer must be properly aligned.
- *
- * @param src   the source buffer (64-bit aligned)
- * @return  the decoded value
- */
-static SPH_INLINE sph_u64
-sph_dec64le_aligned(const void *src)
-{
-#if SPH_LITTLE_ENDIAN
-	return *(const sph_u64 *)src;
-#elif SPH_BIG_ENDIAN
-#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
-	sph_u64 tmp;
-
-	__asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
-	return tmp;
-/*
- * Not worth it generally.
- *
-#elif SPH_PPC32_GCC && !SPH_NO_ASM
-	return (sph_u64)sph_dec32le_aligned(src)
-		| ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
-#elif SPH_PPC64_GCC && !SPH_NO_ASM
-	sph_u64 tmp;
-
-	__asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
-	return tmp;
- */
-#else
-	return sph_bswap64(*(const sph_u64 *)src);
-#endif
-#else
-	return (sph_u64)(((const unsigned char *)src)[0])
-		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
-		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
-		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
-		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
-		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
-		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
-		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
-#endif
-}
-
-#endif
-
-#endif /* Doxygen excluded block */
-
-#endif

From 9e1209a868d969514c3ac837f4deea94efb4f53b Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Mon, 4 Sep 2023 14:13:06 -0300
Subject: [PATCH 106/150] cuda: Perform autocalibration of the grid size.

The size of the job sent to the GPU is gridsize * threadcount. The
thread count can be roughly predetermined, based on the properties of
the kernel and the currently available device families. This is
currently set by default as 256.

Autocalibration is done in a similar fashion to the OpenCL one, by
determining the size where each job takes about 500ms.
---
 config.go           |  67 ++++++++++++++++++++
 cudevice.go         | 148 ++++++++++++++++++++++++++++++++++++++++----
 sample-gominer.conf |  17 +++++
 3 files changed, 221 insertions(+), 11 deletions(-)

diff --git a/config.go b/config.go
index 07c00b7..5b28b0c 100644
--- a/config.go
+++ b/config.go
@@ -26,6 +26,11 @@ const (
 	defaultLogDirname     = "logs"
 	defaultLogFilename    = "gominer.log"
 	defaultClKernel       = "blake3.cl"
+
+	// defaultCudaThreadCount is the default number of threads to execute
+	// in a CUDA batch job. This has been empirically determined to be
+	// a reasonable default in current Nvidia hardware.
+	defaultCudaThreadCount = "256"
 )
 
 var (
@@ -94,6 +99,12 @@ type config struct {
 	WorkSize          string `short:"W" long:"worksize" description:"The explicitly declared sizes of the work to do per device (overrides intensity). Single global value or a comma separated list."`
 	WorkSizeInts      []uint32
 
+	// CUDA-related config parameters.
+	CudaGridSize        string `long:"cudagridsize" description:"Size of the CUDA grid to use per device. Single global value or a comma separated list"`
+	CudaGridSizeInts    []int
+	CudaThreadCount     string `long:"cudathreadcount" description:"Number of CUDA threads to use per device. Single global value or a comma separated list"`
+	CudaThreadCountInts []int
+
 	// Pool related options
 	Pool         string `short:"o" long:"pool" description:"Pool to connect to (e.g.stratum+tcp://pool:port)"`
 	PoolUser     string `short:"m" long:"pooluser" description:"Pool username"`
@@ -219,6 +230,48 @@ func cleanAndExpandPath(path string) string {
 	return filepath.Clean(os.ExpandEnv(path))
 }
 
+// commaListToInts converts a string holding a single int or a comma separated
+// list of ints into a slice of ints.  An empty string yields a nil slice.
+func commaListToInts(s string) ([]int, error) {
+	if len(s) == 0 {
+		return nil, nil
+	}
+
+	// Parse a list like "29,30"
+	var res []int
+	if strings.Contains(s, ",") {
+		split := strings.Split(s, ",")
+		res = make([]int, len(split))
+		for i := range split {
+			j, err := strconv.Atoi(split[i])
+			if err != nil {
+				err := fmt.Errorf("item %q is not an int: %v",
+					split[i], err)
+				return nil, err
+			}
+			res[i] = j
+		}
+	} else {
+		i, err := strconv.Atoi(s)
+		if err != nil {
+			return nil, fmt.Errorf("%q is not an int: %v", s, err)
+		}
+		res = []int{i}
+	}
+
+	return res, nil
+}
+
+// ithOrFirstInt returns s[index] if index < len(s), else s[0]; it panics if s is empty.
+//
+//nolint:unused
+func ithOrFirstInt(s []int, index int) int {
+	if index < len(s) {
+		return s[index]
+	}
+	return s[0]
+}
+
 // loadConfig initializes and parses the config using a config file and command
 // line options.
 //
@@ -240,6 +293,8 @@ func loadConfig() (*config, []string, error) {
 		RPCServer:  defaultRPCServer,
 		RPCCert:    defaultRPCCertFile,
 		ClKernel:   defaultClKernel,
+
+		CudaThreadCount: defaultCudaThreadCount,
 	}
 
 	// Create the home directory if it doesn't already exist.
@@ -543,6 +598,18 @@ func loadConfig() (*config, []string, error) {
 		}
 	}
 
+	cfg.CudaGridSizeInts, err = commaListToInts(cfg.CudaGridSize)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Cannot convert CUDA grid size to int: %v\n", err)
+		return nil, nil, err
+	}
+
+	cfg.CudaThreadCountInts, err = commaListToInts(cfg.CudaThreadCount)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Cannot convert CUDA thread count to int: %v\n", err)
+		return nil, nil, err
+	}
+
 	// Special show command to list supported subsystems and exit.
 	if cfg.DebugLevel == "show" {
 		fmt.Println("Supported subsystems", supportedSubsystems())
diff --git a/cudevice.go b/cudevice.go
index b6438f8..347649b 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -12,6 +12,8 @@ import "C"
 
 import (
 	"fmt"
+	"math"
+	"math/bits"
 	"reflect"
 	"runtime"
 	"sync"
@@ -27,12 +29,6 @@ import (
 )
 
 const (
-	// threadsPerBlock is the nb of CUDA threads per processing block.
-	threadsPerBlock = 1024
-
-	// dimGrid is the nb of CUDA blocks to issue.
-	dimGrid = 65504
-
 	// maxOutputNbs is the max number of individual output results. MUST
 	// match what is defined in decred.cu.
 	maxOutputResults = 32
@@ -62,7 +58,9 @@ type Device struct {
 	tempTarget               uint32
 
 	// Items for CUDA device
-	cuDeviceID cu.Device
+	cuDeviceID    cu.Device
+	cuThreadCount uint32
+	cuGridSize    uint32
 
 	// extraNonce is the device extraNonce, where the first
 	// byte is the device ID (supporting up to 255 devices)
@@ -260,10 +258,39 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 		}
 	}
 
-	d.started = uint32(time.Now().Unix())
+	// Use the max nb of threads by default.
+	threadCount := uint32(devProps.MaxThreadsPerBlock)
+	if len(cfg.CudaThreadCountInts) > 0 {
+		threadCount = uint32(ithOrFirstInt(cfg.CudaThreadCountInts, order))
+		if threadCount > uint32(devProps.MaxThreadsPerBlock) {
+			return nil, fmt.Errorf("specified CUDA thread count %d "+
+				"greater than maximum allowed by device #%d (%d)",
+				threadCount, deviceID, devProps.MaxThreadsPerBlock)
+		}
+	}
 
-	// Autocalibrate?
+	// Autocalibrate the desired grid size for the device.
+	var gridSize uint32
+	autocalibrate := len(cfg.CudaGridSize) == 0
+	if autocalibrate {
+		var err error
+		calibrateTime := ithOrFirstInt(cfg.AutocalibrateInts, order)
+		gridSize, err = d.calcGridSizeForMilliseconds(calibrateTime, threadCount)
+		if err != nil {
+			return nil, err
+		}
 
+		minrLog.Infof("Autocalibration successful, grid size for %v"+
+			"ms per kernel execution on device %v with %d threads "+
+			"determined to be %v",
+			calibrateTime, d.index, threadCount, gridSize)
+	} else {
+		gridSize = uint32(ithOrFirstInt(cfg.CudaGridSizeInts, order))
+	}
+
+	d.cuGridSize = gridSize
+	d.cuThreadCount = threadCount
+	d.started = uint32(time.Now().Unix())
 	return d, nil
 }
 
@@ -315,7 +342,6 @@ func (d *Device) runDevice() error {
 	nonceResultsD := cu.Malloc(maxOutputResults * WORDSZ)
 	defer cu.MemFreeHost(nonceResultsH)
 	defer nonceResultsD.Free()
-
 	nonceResultsHSliceHeader := reflect.SliceHeader{
 		Data: uintptr(nonceResultsH),
 		Len:  int(maxOutputResults),
@@ -356,7 +382,7 @@ func (d *Device) runDevice() error {
 
 		// Execute the kernel and follow its execution time.
 		currentTime := time.Now()
-		decredBlake3Hash(dimGrid, threadsPerBlock, midstateH, lastBlockH, nonceResultsD)
+		decredBlake3Hash(d.cuGridSize, d.cuThreadCount, midstateH, lastBlockH, nonceResultsD)
 
 		// Copy results back from device to host.
 		cu.MemcpyDtoH(nonceResultsH, nonceResultsD, maxOutputResults*WORDSZ)
@@ -382,6 +408,106 @@ func (d *Device) runDevice() error {
 	}
 }
 
+// getKernelExecutionTime times one kernel run (launch plus result readback) on the device.
+func (d *Device) getKernelExecutionTime(gridSize, threadCount uint32) (time.Duration,
+	error) {
+
+	const WORDSZ = 4 // Everything is sent as uint32.
+
+	// Setup input buffers. Slice lengths are in uint32 elements, not bytes.
+	midstateSz := int64(len(d.midstate) * WORDSZ)
+	midstateH := cu.MallocHost(midstateSz)
+	defer cu.MemFreeHost(midstateH)
+	midstateHSliceHeader := reflect.SliceHeader{
+		Data: uintptr(midstateH),
+		Len:  len(d.midstate),
+		Cap:  len(d.midstate),
+	}
+	midstateHSlice := *(*[]uint32)(unsafe.Pointer(&midstateHSliceHeader))
+
+	lastBlockSz := int64(len(d.lastBlock) * WORDSZ)
+	lastBlockH := cu.MallocHost(lastBlockSz)
+	defer cu.MemFreeHost(lastBlockH)
+	lastBlockHSliceHeader := reflect.SliceHeader{
+		Data: uintptr(lastBlockH),
+		Len:  len(d.lastBlock),
+		Cap:  len(d.lastBlock),
+	}
+	lastBlockHSlice := *(*[]uint32)(unsafe.Pointer(&lastBlockHSliceHeader))
+
+	// Setup output buffer.
+	nonceResultsH := cu.MallocHost(maxOutputResults * WORDSZ)
+	nonceResultsD := cu.Malloc(maxOutputResults * WORDSZ)
+	defer cu.MemFreeHost(nonceResultsH)
+	defer nonceResultsD.Free()
+	nonceResultsHSliceHeader := reflect.SliceHeader{
+		Data: uintptr(nonceResultsH),
+		Len:  int(maxOutputResults),
+		Cap:  int(maxOutputResults),
+	}
+	nonceResultsHSlice := *(*[]uint32)(unsafe.Pointer(&nonceResultsHSliceHeader))
+
+	// Clear the results buffer.
+	nonceResultsHSlice[0] = 0
+	cu.MemcpyHtoD(nonceResultsD, nonceResultsH, maxOutputResults*WORDSZ)
+
+	// Copy data into the input buffers.
+	copy(midstateHSlice, d.midstate[:])
+	copy(lastBlockHSlice, d.lastBlock[:])
+
+	// Time the kernel launch together with the copy of its results to the host.
+	currentTime := time.Now()
+	decredBlake3Hash(gridSize, threadCount, midstateH, lastBlockH, nonceResultsD)
+	cu.MemcpyDtoH(nonceResultsH, nonceResultsD, maxOutputResults*WORDSZ)
+	elapsedTime := time.Since(currentTime)
+	minrLog.Tracef("DEV #%d: Kernel execution to read time for work "+
+		"size calibration: %v", d.index, elapsedTime)
+
+	return elapsedTime, nil
+}
+
+// calcGridSizeForMilliseconds calculates the grid size needed to achieve
+// a device execution cycle of the passed duration in milliseconds.
+func (d *Device) calcGridSizeForMilliseconds(ms int, threadCount uint32) (uint32, error) {
+	gridSize := uint32(32)
+	timeToAchieve := time.Duration(ms) * time.Millisecond
+	for {
+		execTime, err := d.getKernelExecutionTime(gridSize, threadCount)
+		if err != nil {
+			return 0, err
+		}
+
+		// If we fail to go above the desired execution time, double
+		// the grid size and try again.
+		if execTime < timeToAchieve && gridSize < 1<<30 {
+			gridSize <<= 1
+			continue
+		}
+
+		// The latest call reached the desired execution time (or the grid
+		// size limit), so now calculate what the ideal grid size should be.
+		adj := float64(gridSize) * (float64(timeToAchieve) / float64(execTime))
+		adj /= 256.0
+		adjMultiple256 := uint32(math.Ceil(adj))
+		gridSize = adjMultiple256 * 256
+
+		// Clamp the gridsize if it will cause the nonce to overflow an
+		// uint32 (allowing this would cause duplicated hashing effort).
+		if bits.Len32(threadCount-1)+bits.Len32(gridSize-1) > 32 {
+			gridSize = 1 << (32 - bits.Len32(threadCount-1))
+		}
+
+		// Round it down to a multiple of 32 (minimum 32) for best CUDA performance.
+		gridSize = gridSize - (gridSize % 32)
+		if gridSize < 32 {
+			gridSize = 32
+		}
+
+		break
+	}
+
+	return gridSize, nil
+}
 func newMinerDevs(m *Miner) (*Miner, int, error) {
 	deviceListIndex := 0
 	deviceListEnabledCount := 0
diff --git a/sample-gominer.conf b/sample-gominer.conf
index 630fd8e..47ef214 100644
--- a/sample-gominer.conf
+++ b/sample-gominer.conf
@@ -129,6 +129,23 @@
 ; Password for mining pool.
 ; poolpass=
 
+
+; ------------------------------------------------------------------------------
+; CUDA-related settings.
+; ------------------------------------------------------------------------------
+; Size of the grid to use when sending a job to the GPU. If unspecified, gominer
+; will perform autocalibration during startup to size the grid so that each job
+; takes around 500ms to perform.
+; This may also be specified as a comma separated list of values to use when
+; more than one device is available.
+; cudagridsize=
+
+; Number of threads to use when sending a job to the GPU. If unspecified, a
+; default value of 256 threads per block is used.
+; This may also be specified as a comma separated list of values to use when
+; more than one device is available.
+; cudathreadcount=256
+
 ; ------------------------------------------------------------------------------
 ; Experimental settings
 ; Settings in this section are new and/or dangerous and have the potential to

From 7267317a431768fb7f2a827c7939e99795a40428 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 12:21:18 -0500
Subject: [PATCH 107/150] Switch Windows OpenCL build instructions to msys2.

---
 README.md | 81 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index afb9aad..ca3a27b 100644
--- a/README.md
+++ b/README.md
@@ -118,23 +118,54 @@ go build -tags opencladl
 
 ### Windows
 
-#### Pre-Requisites
-
-- Download and install the official Go Windows binaries from [https://golang.dl/](https://golang.org/dl/)
-- Download and install Git for Windows from [https://git-for-windows.github.io/](https://git-for-windows.github.io/)
-  * Make sure to select the Git-Bash option when prompted
-- Download the MinGW-w64 installer from [https://sourceforge.net/projects/mingw-w64/files/Toolchains targetting Win32/Personal Builds/mingw-builds/installer/](https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/)
-  * Select the x64 toolchain and use defaults for the other questions
-- `git clone https://github.com/decred/gominer`
+#### OpenCL Build Instructions (Works with Both NVIDIA and AMD)
+
+##### OpenCL Pre-Requisites
+
+- Download and install [MSYS2](https://www.msys2.org/)
+  - Make sure you uncheck `Run MSYS2 now.`
+- Launch the `MSYS2 MINGW64` shell from the start menu
+  - NOTE: The `MSYS2` installer will launch the `UCRT64` shell by default if
+    you didn't uncheck `Run MSYS2 now` as instructed.  That shell will not work,
+    so close it if you forgot to uncheck it in the installer.
+- From within the `MSYS2 MINGW64` shell enter the following commands to install
+  `gcc`, `git`, `go`, `unzip`, and the `gominer` source code
+  - `pacman -S mingw-w64-x86_64-gcc mingw-w64-x86_64-tools mingw-w64-x86_64-go git unzip`
+  - `wget https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/files/1406216/lightOCLSDK.zip`
+  - `unzip -d /c/appsdk lightOCLSDK.zip`
+  - `git clone https://github.com/decred/gominer`
+- **Close the `MSYS2 MINGW64` and relaunch it**
+  - NOTE: This is necessary to ensure all of the new environment variables are set properly
+- Go to the appropriate section for either NVIDIA or AMD depending on which type of GPU you have
+
+###### OpenCL with NVIDIA
+
+- Build gominer
+  - `cd ~/gominer`
+  - `go build -tags opencl`
+- Test `gominer` detects your GPU
+  - `./gominer -l`
+
+##### OpenCL with AMD
 
-#### Build Instructions
+- Change to the library directory C:\appsdk\lib\x86_64
+  * `cd /c/appsdk/lib/x86_64`
+- Copy and prepare the ADL library for linking
+  - `cp /c/Windows/SysWOW64/atiadlxx.dll .`
+  - `gendef atiadlxx.dll`
+  - `dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def`
+- Build gominer
+  - `cd ~/gominer`
+  - `go build -tags opencl`
+- Test `gominer` detects your GPU
+  - `./gominer -l`
 
-##### CUDA
+#### CUDA Build Instructions (NVIDIA only)
 
-**NOTE**: The CUDA version of the Blake3 gominer is not yet compatible to
-windows.
+**NOTE**: The CUDA version of the Blake3 gominer is not yet compatible with
+Windows.
 
-###### Pre-Requisites
+##### Pre-Requisites
 
 - Download Microsoft Visual Studio 2013 from [https://www.microsoft.com/en-us/download/details.aspx?id=44914](https://www.microsoft.com/en-us/download/details.aspx?id=44914)
 - Add `C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin` to your PATH
@@ -148,27 +179,3 @@ windows.
 - Copy dependencies:
   * ```copy obj/decred.dll .```
   * ```copy nvidia/NVSMI/nvml.dll .```
-
-##### OpenCL/ADL
-
-###### Pre-Requisites
-
-- Download OpenCL SDK from [https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases/tag/1.0](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases/tag/1.0)
-- Unzip or untar the downloaded `lightOCLSDK` archive to `C:\appsdk`
-  * Ensure the folders `C:\appsdk\include` and `C:\appsdk\lib` are populated
-- Change to the library directory C:\appsdk\lib\x86_64
-  * `cd /D C:\appsdk\lib\x86_64`
-- Copy and prepare the ADL library for linking
-  * `copy c:\Windows\SysWOW64\atiadlxx.dll .`
-  * `gendef atiadlxx.dll`
-  * `dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def`
-
-###### Steps
-
-- For OpenCL:
-  * `cd gominer`
-  * `go build -tags opencl`
-
-- For OpenCL with AMD Device Library (ADL) support:
-  * `cd gominer`
-  * `go build -tags opencladl`

From cfac81684dc90e4a00e8b888397837d99aa5fbee Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 12:46:02 -0500
Subject: [PATCH 108/150] Correct a couple of README typos.

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ca3a27b..c2c6ea6 100644
--- a/README.md
+++ b/README.md
@@ -134,11 +134,11 @@ go build -tags opencladl
   - `wget https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/files/1406216/lightOCLSDK.zip`
   - `unzip -d /c/appsdk lightOCLSDK.zip`
   - `git clone https://github.com/decred/gominer`
-- **Close the `MSYS2 MINGW64` and relaunch it**
+- **Close the `MSYS2 MINGW64` shell and relaunch it**
   - NOTE: This is necessary to ensure all of the new environment variables are set properly
 - Go to the appropriate section for either NVIDIA or AMD depending on which type of GPU you have
 
-###### OpenCL with NVIDIA
+##### OpenCL with NVIDIA
 
 - Build gominer
   - `cd ~/gominer`

From abdcf6963ec55710528a451886e67d4da4051e83 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 16:30:44 -0500
Subject: [PATCH 109/150] Add OpenCL AMD/NVIDIA Linux Build Instructions.

---
 README.md | 121 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 74 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index c2c6ea6..81c2149 100644
--- a/README.md
+++ b/README.md
@@ -76,45 +76,87 @@ $ curl http://localhost:3333/
 
 ### Linux
 
-#### Pre-Requisites
+#### Preliminaries
 
-You will either need to install CUDA for NVIDIA graphics cards or OpenCL
-library/headers that support your device such as: AMDGPU-PRO (for newer AMD
-cards), Beignet (for Intel Graphics), or Catalyst (for older AMD cards).
+Gominer works with both OpenCL (both AMD and NVIDIA) and CUDA (NVIDIA only).
+At the current time, most users have reported that OpenCL gives them higher
+hashrates on NVIDIA.
 
-For example, on Ubuntu 23.04 you can install the necessary OpenCL packages (for
-Intel Graphics) and CUDA libraries with:
+**NOTE: Although gominer works with CUDA, there are not any build instructions
+yet.  They will be provided at a later date**.
 
-```
-sudo apt-get install nvidia-cuda-dev nvidia-cuda-toolkit
-```
+Once you decide on OpenCL or CUDA, you will need to install the
+graphics driver for your GPU as well as the headers for OpenCL or CUDA
+depending on your choice.
 
-gominer has been built successfully on Ubuntu 23.04 with go1.21.0,
-g++ 5.4.0 although other combinations should work as well.
+The exact packages are dependent on the specific Linux distribution, but,
+generally speaking, you will need the latest AMDGPU-PRO display drivers for
+AMD cards and the latest NVIDIA graphics display drivers for NVIDIA cards.
 
-#### Instructions
+You will also need the OpenCL headers which is typically named something
+similar to `mesa-opencl-dev` (for AMD) or `nvidia-opencl-dev` for NVIDIA.
 
-To download and build gominer, run:
+If you're using OpenCL, it is also recommended to install your distribution's
+equivalent of the `clinfo` package if you have any issues to ensure your
+device can be detected by OpenCL.
 
-```
-git clone https://github.com/decred/gominer
-cd gominer
-```
+The following sections provide instructions for the following combinations:
 
-For CUDA with NVIDIA Management Library (NVML) support:
-```
-make
-```
+* OpenCL for NVIDIA on Ubuntu 23.04
+* OpenCL for AMD on Debian Bookworm
 
-For OpenCL (autodetects AMDGPU support):
-```
-go build -tags opencl
-```
+#### OpenCL Build Instructions (Works with Both NVIDIA and AMD)
 
-For OpenCL with AMD Device Library (ADL) support:
-```
-go build -tags opencladl
-```
+##### OpenCL with NVIDIA on Ubuntu 23.04
+
+- Detect the model of your NVIDIA GPU and the recommended driver
+  - `ubuntu-drivers devices`
+- Install the NVIDIA graphics driver
+  - **If you agree with the recommended drivers**
+    - `sudo ubuntu-drivers autoinstall`
+  - **Alternatively, install a specific driver (forr example)**
+    - `sudo apt install nvidia-driver-525-server`
+- Reboot to allow the graphics driver to load
+  - `sudo reboot`
+- Install the OpenCL headers, `git` and `go`
+  - `sudo apt install nvidia-opencl-dev git golang`
+- Obtain the `gominer` source code
+  - `git clone https://github.com/decred/gominer`
+- Build `gominer`
+  - `cd gominer`
+  - `go build -tags opencl`
+- Test `gominer` detects your GPU
+  - `./gominer -l`
+
+##### OpenCL with AMD on Debian Bookworm
+
+- Enable the non-free (closed source) repository by using your favorite editor
+  to modify `/etc/apt/sources.list` and appending `contrib non-free` to the
+  `deb` respoitory
+  - `$EDITOR /etc/apt/sources.list``
+    - It should look similar to the following
+      ```
+      deb http://ftp.us.debian.org/debian bookworm-updates main contrib non-free
+      deb http://security.debian.org bookworm-security main contrib non-free
+      ```
+- Update the Apt package manager with the new sources
+  - `apt update`
+- Install the AMD graphics driver and supporting firmware
+  - `apt install firmware-linux firmware-linux-nonfree libdrm-amdgpu1 xserver-xorg-video-amdgpu`
+- Install the OpenCL headers, `git` and `go`
+  - `sudo apt install mesa-opencl-dev git golang`
+- Obtain the `gominer` source code
+  - `git clone https://github.com/decred/gominer`
+- Build `gominer`
+  - `cd gominer`
+  - `go build -tags opencl`
+- Test `gominer` detects your GPU
+  - `./gominer -l`
+
+#### CUDA Build Instructions (NVIDIA only)
+
+**Build instructions are not available yet.  They will be provided at a later
+date**.
 
 ### Windows
 
@@ -129,7 +171,7 @@ go build -tags opencladl
     you didn't uncheck `Run MSYS2 now` as instructed.  That shell will not work,
     so close it if you forgot to uncheck it in the installer.
 - From within the `MSYS2 MINGW64` shell enter the following commands to install
-  `gcc`, `git`, `go`, `unzip`, and the `gominer` source code
+  `gcc`, `git`, `go`, `unzip`, the light OpenCL SDK, and the `gominer` source code
   - `pacman -S mingw-w64-x86_64-gcc mingw-w64-x86_64-tools mingw-w64-x86_64-go git unzip`
   - `wget https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/files/1406216/lightOCLSDK.zip`
   - `unzip -d /c/appsdk lightOCLSDK.zip`
@@ -162,20 +204,5 @@ go build -tags opencladl
 
 #### CUDA Build Instructions (NVIDIA only)
 
-**NOTE**: The CUDA version of the Blake3 gominer is not yet compatible with
-Windows.
-
-##### Pre-Requisites
-
-- Download Microsoft Visual Studio 2013 from [https://www.microsoft.com/en-us/download/details.aspx?id=44914](https://www.microsoft.com/en-us/download/details.aspx?id=44914)
-- Add `C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin` to your PATH
-- Install CUDA 7.0 from [https://developer.nvidia.com/cuda-toolkit-70](https://developer.nvidia.com/cuda-toolkit-70)
-- Add `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\bin` to your PATH
-
-###### Steps
-- Using git-bash:
-  * ```cd $GOPATH/src/github.com/decred/gominer```
-  * ```mingw32-make.exe```
-- Copy dependencies:
-  * ```copy obj/decred.dll .```
-  * ```copy nvidia/NVSMI/nvml.dll .```
+**NOTE**: The CUDA version of the `gominer` is not yet compatible with
+Windows.
\ No newline at end of file

From d725cda10e79b05246def1e38215582cc2c83366 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 18:36:10 -0500
Subject: [PATCH 110/150] README: Add some configuration details.

Also clean up a few other minor things.
---
 README.md | 86 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 61 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 81c2149..11e6cb3 100644
--- a/README.md
+++ b/README.md
@@ -6,28 +6,58 @@ and stratum/pool mining using OpenCL devices.
 
 ## Downloading
 
-Linux and Windows 64-bit binaries may be downloaded from:
-
-[https://github.com/decred/decred-binaries/releases/latest](https://github.com/decred/decred-binaries/releases/latest)
+Binaries are not currently available.  See the [Building](#building)
+([Windows](#windows), [Linux](#linux)) section for details on how to build
+`gominer` from source.
+
+## Configuring `gominer`
+
+`gominer` needs to acquire work in order to have something to solve.  Currently, the only supported method is solo mining via a `dcrd` RPC server.  There are plans to support [dcrpool](https://github.com/decred/dcrpool) for pooled mining in the future.
+
+In order to communicate with the `dcrd` RPC server, `gominer` must be configured
+with `dcrd`'s RPC server credentials.
+
+- Obtain the RPC username and password by finding the `rpcuser` and `rpcpass`
+  entries in the `dcrd.conf` file
+  - Windows: `%LOCALAPPDATA%\Dcrd\dcrd.conf`
+  - Linux: `~/.dcrd/dcrd.conf`
+  - MacOS: `~/Library/Application Support/Dcrd/dcrd.conf`
+- Create a `gominer.conf` file at the platform-specific path that contains the
+  **exact same** `rpcuser=` and `rpcpass=` lines you obtained from the
+  `dcrd.conf` file in the previous step
+  - Windows: `%LOCALAPPDATA%\Gominer\gominer.conf`
+  - Linux: `~/.gominer/gominer.conf`
+  - MacOS: `~/Library/Application Support/Gominer/gominer.conf`
+  - The `gominer.conf` config file should have at least the following lines:
+  ```
+  rpcuser=<same rpcuser from dcrd.conf>
+  rpcpass=<same rpcpass from dcrd.conf>
+  ```
+
+Next, `dcrd` must be configured with a mining address to send the payment for
+mined blocks.  That is accomplished by either launching `dcrd` with the
+`--miningaddr=Ds...` CLI flag or adding a `miningaddr=Ds...` to the
+aforementioned `dcrd.conf` file and restarting `dcrd`.
 
 ## Running
 
-Benchmark mode:
+### Benchmark mode
 
-```
-gominer -B
-```
-
-Solo mining on mainnet using dcrd running on the local host:
+`gominer` provides a benchmark mode where no work is submitted in order to test
+your setup.
 
 ```
-gominer -u myusername -P hunter2
+./gominer -B
 ```
 
-Stratum/pool mining:
+### Solo Mining on Mainnet
+
+Ensure you have [configured](#configuring-gominer) `gominer` with `dcrd`'s RPC
+credentials as well as `dcrd` with a `miningaddr`.  Once the credentials and
+mining address have been configured, simply run gominer to begin mining.
 
 ```
-gominer -o stratum+tcp://pool:port -m username -n password
+./gominer
 ```
 
 ## Status API
@@ -94,16 +124,17 @@ generally speaking, you will need the latest AMDGPU-PRO display drivers for
 AMD cards and the latest NVIDIA graphics display drivers for NVIDIA cards.
 
 You will also need the OpenCL headers which is typically named something
-similar to `mesa-opencl-dev` (for AMD) or `nvidia-opencl-dev` for NVIDIA.
+similar to `mesa-opencl-dev` (for AMD) or `nvidia-opencl-dev` (for NVIDIA).
 
 If you're using OpenCL, it is also recommended to install your distribution's
-equivalent of the `clinfo` package if you have any issues to ensure your
-device can be detected by OpenCL.
+equivalent of the `clinfo` package if you have any issues to ensure your device
+can be detected by OpenCL.  When `clinfo` is unable to detect your device,
+`gominer` will not be able to either.
 
 The following sections provide instructions for the following combinations:
 
-* OpenCL for NVIDIA on Ubuntu 23.04
-* OpenCL for AMD on Debian Bookworm
+* [OpenCL for NVIDIA on Ubuntu 23.04](#opencl-with-nvidia-on-ubuntu-2304)
+* [OpenCL for AMD on Debian Bookworm](#opencl-with-amd-on-debian-bookworm)
 
 #### OpenCL Build Instructions (Works with Both NVIDIA and AMD)
 
@@ -114,7 +145,7 @@ The following sections provide instructions for the following combinations:
 - Install the NVIDIA graphics driver
   - **If you agree with the recommended drivers**
     - `sudo ubuntu-drivers autoinstall`
-  - **Alternatively, install a specific driver (forr example)**
+  - **Alternatively, install a specific driver (for example)**
     - `sudo apt install nvidia-driver-525-server`
 - Reboot to allow the graphics driver to load
   - `sudo reboot`
@@ -125,15 +156,16 @@ The following sections provide instructions for the following combinations:
 - Build `gominer`
   - `cd gominer`
   - `go build -tags opencl`
-- Test `gominer` detects your GPU
+- Test `gominer` detects your GPU(s)
   - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
 
 ##### OpenCL with AMD on Debian Bookworm
 
 - Enable the non-free (closed source) repository by using your favorite editor
   to modify `/etc/apt/sources.list` and appending `contrib non-free` to the
   `deb` respoitory
-  - `$EDITOR /etc/apt/sources.list``
+  - `$EDITOR /etc/apt/sources.list`
     - It should look similar to the following
       ```
       deb http://ftp.us.debian.org/debian bookworm-updates main contrib non-free
@@ -150,8 +182,9 @@ The following sections provide instructions for the following combinations:
 - Build `gominer`
   - `cd gominer`
   - `go build -tags opencl`
-- Test `gominer` detects your GPU
+- Test `gominer` detects your GPU(s)
   - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
 
 #### CUDA Build Instructions (NVIDIA only)
 
@@ -178,29 +211,32 @@ date**.
   - `git clone https://github.com/decred/gominer`
 - **Close the `MSYS2 MINGW64` shell and relaunch it**
   - NOTE: This is necessary to ensure all of the new environment variables are set properly
-- Go to the appropriate section for either NVIDIA or AMD depending on which type of GPU you have
+- Jump to the appropriate section for either [NVIDIA](#opencl-with-nvidia) or
+  [AMD](#opencl-with-amd) depending on which type of GPU you have
 
 ##### OpenCL with NVIDIA
 
 - Build gominer
   - `cd ~/gominer`
   - `go build -tags opencl`
-- Test `gominer` detects your GPU
+- Test `gominer` detects your GPU(s)
   - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
 
 ##### OpenCL with AMD
 
 - Change to the library directory C:\appsdk\lib\x86_64
   * `cd /c/appsdk/lib/x86_64`
-- Copy and prepare the ADL library for linking
+- Copy and prepare the AMD Display Library (ADL) for linking
   - `cp /c/Windows/SysWOW64/atiadlxx.dll .`
   - `gendef atiadlxx.dll`
   - `dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def`
 - Build gominer
   - `cd ~/gominer`
   - `go build -tags opencl`
-- Test `gominer` detects your GPU
+- Test `gominer` detects your GPU(s)
   - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
 
 #### CUDA Build Instructions (NVIDIA only)
 

From 99e749db61df0715fcc7b37689df3db44f350647 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 18:36:10 -0500
Subject: [PATCH 111/150] README: Fix a few typos.

Also, no pool support at the moment.
---
 README.md | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 11e6cb3..f940568 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
 # gominer
 
 gominer is an application for performing Proof-of-Work (PoW) mining on the
-Decred network after the activation of DCP0011 using BLAKE3.  It supports solo
-and stratum/pool mining using OpenCL devices.
+Decred network after the activation of
+[DCP0011](https://github.com/decred/dcps/blob/master/dcp-0011/dcp-0011.mediawiki)
+using BLAKE3.  It supports solo mining using OpenCL and CUDA devices.
 
 ## Downloading
 
@@ -108,12 +109,12 @@ $ curl http://localhost:3333/
 
 #### Preliminaries
 
-Gominer works with both OpenCL (both AMD and NVIDIA) and CUDA (NVIDIA only).
-At the current time, most users have reported that OpenCL gives them higher
-hashrates on NVIDIA.
+Gominer works with OpenCL (both AMD and NVIDIA) and CUDA (NVIDIA only).  At the
+current time, most users have reported that OpenCL gives them higher hashrates
+on NVIDIA.
 
 **NOTE: Although gominer works with CUDA, there are not any build instructions
-yet.  The will be provided at a later date**.
+yet.  They will be provided at a later date**.
 
 Once you decide on OpenCL or CUDA, you will need to install the
 graphics driver for your GPU as well as the headers for OpenCL or CUDA
@@ -131,7 +132,7 @@ equivalent of the `clinfo` package if you have any issues to ensure your device
 can be detected by OpenCL.  When `clinfo` is unable to detect your device,
 `gominer` will not be able to either.
 
-The following sections provide instructions for the following combinations:
+The following sections provide instructions for these combinations:
 
 * [OpenCL for NVIDIA on Ubuntu 23.04](#opencl-with-nvidia-on-ubuntu-2304)
 * [OpenCL for AMD on Debian Bookworm](#opencl-with-amd-on-debian-bookworm)
@@ -149,7 +150,7 @@ The following sections provide instructions for the following combinations:
     - `sudo apt install nvidia-driver-525-server`
 - Reboot to allow the graphics driver to load
   - `sudo reboot`
-- Install the OpenCL headers, `git` adnd `go`
+- Install the OpenCL headers, `git` and `go`
   - `sudo apt install nvidia-opencl-dev git golang`
 - Obtain the `gominer` source code
   - `git clone https://github.com/decred/gominer`
@@ -164,7 +165,7 @@ The following sections provide instructions for the following combinations:
 
 - Enable the non-free (closed source) repository by using your favorite editor
   to modify `/etc/apt/sources.list` and appending `contrib non-free` to the
-  `deb` respoitory
+  `deb` repository
   - `$EDITOR /etc/apt/sources.list`
     - It should look similar to the following
       ```
@@ -172,10 +173,10 @@ The following sections provide instructions for the following combinations:
       deb http://security.debian.org bookworm-security main contrib non-free
       ```
 - Update the Apt package manager with the new sources
-  - `apt update`
+  - `sudo apt update`
 - Install the AMD graphics driver and supporting firmware
-  - `apt install firmware-linux firmware-linux-nonfree libdrm-amdgpu1 xserver-xorg-video-amdgpu`
-- Install the OpenCL headers, `git` adnd `go`
+  - `sudo apt install firmware-linux firmware-linux-nonfree libdrm-amdgpu1 xserver-xorg-video-amdgpu`
+- Install the OpenCL headers, `git` and `go`
   - `sudo apt install mesa-opencl-dev git golang`
 - Obtain the `gominer` source code
   - `git clone https://github.com/decred/gominer`

From 107c435206fb09e5be8a1571001069eb33ac8200 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Thu, 7 Sep 2023 14:35:03 +0100
Subject: [PATCH 112/150] stratum: Don't reverse Version bytes.

Unsure why this reversing was in place, but the testnet3 value of
0b000000 was causing the decode to fail. With this change it correctly
decodes to 11.
---
 stratum/stratum.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stratum/stratum.go b/stratum/stratum.go
index 4ea8231..2646d6c 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -870,11 +870,11 @@ func (s *Stratum) PrepWork() error {
 
 	// Serialize header.
 	bh := wire.BlockHeader{}
-	v, err := util.ReverseToInt(s.PoolWork.Version)
+	v, err := hex.DecodeString(s.PoolWork.Version)
 	if err != nil {
 		return err
 	}
-	bh.Version = v
+	bh.Version = int32(binary.LittleEndian.Uint32(v))
 
 	nbits, err := hex.DecodeString(s.PoolWork.Nbits)
 	if err != nil {

From 293bbb5ee64ec6e0121e64da40091fcf2b09bff8 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Thu, 7 Sep 2023 14:38:54 +0100
Subject: [PATCH 113/150] util: Fix RevHash func.

Previous version was not working properly. New one is validated with a
test.
---
 util/util.go      | 15 +++++++--------
 util/util_test.go | 24 ++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 8 deletions(-)
 create mode 100644 util/util_test.go

diff --git a/util/util.go b/util/util.go
index d520aaa..a7f71c4 100644
--- a/util/util.go
+++ b/util/util.go
@@ -45,15 +45,14 @@ func ReverseToInt(s string) (int32, error) {
 
 // RevHash reverses a hash in string format.
 func RevHash(hash string) string {
-	revHash := ""
-	for i := 0; i < 7; i++ {
-		j := i * 8
-		part := fmt.Sprintf("%c%c%c%c%c%c%c%c",
-			hash[6+j], hash[7+j], hash[4+j], hash[5+j],
-			hash[2+j], hash[3+j], hash[0+j], hash[1+j])
-		revHash += part
+	rev := []rune(hash)
+	for i := 0; i <= len(rev)/2-2; i += 2 {
+		opp := len(rev) - 2 - i
+		rev[i], rev[opp] = rev[opp], rev[i]
+		rev[i+1], rev[opp+1] = rev[opp+1], rev[i+1]
 	}
-	return revHash
+
+	return string(rev)
 }
 
 // DiffToTarget converts a whole number difficulty into a target.
diff --git a/util/util_test.go b/util/util_test.go
new file mode 100644
index 0000000..6d02db5
--- /dev/null
+++ b/util/util_test.go
@@ -0,0 +1,24 @@
+package util
+
+import "testing"
+
+func TestRevHash(t *testing.T) {
+	tests := []struct {
+		name string
+		hash string
+		want string
+	}{
+		{
+			"ok",
+			"f82aadcc5f683978c0cf7b616b41f956bb6c4bbd073e93ccffb868972e000000",
+			"0000002e9768b8ffcc933e07bd4b6cbb56f9416b617bcfc07839685fccad2af8",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := RevHash(tt.hash); got != tt.want {
+				t.Errorf("RevHash() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}

From 8f2e03c1755223471f8c4928b2995fc4e6997c28 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Wed, 6 Sep 2023 23:25:08 -0500
Subject: [PATCH 114/150] README: Add user reported hashrates.

---
 README.md | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f940568..9d09bc9 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@ Decred network after the activation of
 [DCP0011](https://github.com/decred/dcps/blob/master/dcp-0011/dcp-0011.mediawiki)
 using BLAKE3.  It supports solo mining using OpenCL and CUDA devices.
 
+[User Reported Hashrates](#user-reported-hashrates)
+
 ## Downloading
 
 Binaries are not currently available.  See the [Building](#building)
@@ -242,4 +244,25 @@ date**.
 #### CUDA Build Instructions (NVIDIA only)
 
 **NOTE**: The CUDA version of the `gominer` is not yet compatible with
-Windows.
\ No newline at end of file
+Windows.
+
+## User Reported Hashrates
+
+### OpenCL
+
+GPU                    | Hashrate
+-----------------------|---------
+NVIDIA GTX 1060        | 3.0 Gh/s
+AMD RX 580             | 3.7 Gh/s
+NVIDIA 1660 Super      | 5.0 Gh/s
+AMD Vega 56            | 7.0 Gh/s
+NVIDIA RTX 3060 Ti     | 8.7 Gh/s
+NVIDIA GTX 3080 Mobile | 9.4 Gh/s
+NVIDIA RTX 3070        | 10.1 Gh/s
+NVIDIA RTX 2080        | 10.4 Gh/s
+NVIDIA Tesla V100      | 13.9 Gh/s
+NVIDIA Tesla V100S     | 14.6 Gh/s
+NVIDIA RTX 4070        | 14.9 Gh/s
+NVIDIA RTX 3080        | 15.2 Gh/s
+NVIDIA RTX 3090        | 17.6 Gh/s
+AMD 7900 XTX           | 27.2 Gh/s
\ No newline at end of file

From 718d9fffa6d1d5b2119c4a78ed8c7e3dc3d8a141 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Thu, 7 Sep 2023 00:17:24 -0500
Subject: [PATCH 115/150] README: Correct Debian Bookworm build instructions.

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9d09bc9..c2bf2e7 100644
--- a/README.md
+++ b/README.md
@@ -178,8 +178,10 @@ The following sections provide instructions for these combinations:
   - `sudo apt update`
 - Install the AMD graphics driver and supporting firmware
   - `sudo apt install firmware-linux firmware-linux-nonfree libdrm-amdgpu1 xserver-xorg-video-amdgpu`
-- Install the OpenCL headers, `git` and `go`
-  - `sudo apt install mesa-opencl-dev git golang`
+- Install the OpenCL headers, OpenCL Installable Client driver, OpenCL lib, `git` and `go`
+  - `sudo apt install opencl-headers mesa-opencl-icd ocl-icd-libopencl1 git golang`
+- Help the loader find the OpenCL library by creating a symbolic link to it:
+  - `sudo ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/libOpenCL.so`
 - Obtain the `gominer` source code
   - `git clone https://github.com/decred/gominer`
 - Build `gominer`

From 46e4f80cd2f64232af897c7c5d25a2dc2b3072d0 Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Mon, 11 Sep 2023 11:40:45 -0400
Subject: [PATCH 116/150] Setup a signal context and use it. (#207)

---
 cldevice.go       | 11 ++++-----
 cudevice.go       | 10 ++++----
 device.go         | 20 +++++++---------
 main.go           | 11 ++-------
 miner.go          | 42 ++++++++++++++--------------------
 signal.go         | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 signal_syscall.go | 15 ++++++++++++
 7 files changed, 109 insertions(+), 58 deletions(-)
 create mode 100644 signal.go
 create mode 100644 signal_syscall.go

diff --git a/cldevice.go b/cldevice.go
index 74d5d35..825971c 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -8,6 +8,7 @@ package main
 import (
 	"bufio"
 	"bytes"
+	"context"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -244,8 +245,6 @@ type Device struct {
 	allDiffOneShares uint64
 	validShares      uint64
 	invalidShares    uint64
-
-	quit chan struct{}
 }
 
 // If the device order and OpenCL index are ever not the same then we can
@@ -389,7 +388,6 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 		deviceID:    deviceID,
 		deviceName:  getDeviceInfo(deviceID, cl.CL_DEVICE_NAME, "CL_DEVICE_NAME"),
 		deviceType:  getDeviceInfo(deviceID, cl.CL_DEVICE_TYPE, "CL_DEVICE_TYPE"),
-		quit:        make(chan struct{}),
 		newWork:     make(chan *work.Work, 5),
 		workDone:    workDone,
 		fanPercent:  0,
@@ -588,7 +586,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	return d, nil
 }
 
-func (d *Device) runDevice() error {
+func (d *Device) runDevice(ctx context.Context) error {
 	minrLog.Infof("Started DEV #%d: %s", d.index, d.deviceName)
 	outputData := make([]uint32, outputBufferSize)
 
@@ -600,11 +598,12 @@ func (d *Device) runDevice() error {
 	}
 
 	var status cl.CL_int
+	ctxDoneCh := ctx.Done()
 	for {
-		d.updateCurrentWork()
+		d.updateCurrentWork(ctx)
 
 		select {
-		case <-d.quit:
+		case <-ctxDoneCh:
 			return nil
 		default:
 		}
diff --git a/cudevice.go b/cudevice.go
index 347649b..90ae14e 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -82,8 +82,6 @@ type Device struct {
 	allDiffOneShares uint64
 	validShares      uint64
 	invalidShares    uint64
-
-	quit chan struct{}
 }
 
 func decredBlake3Hash(dimgrid, threads uint32, midstate, lastblock unsafe.Pointer, out cu.DevicePtr) {
@@ -199,7 +197,6 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 		deviceType:  DeviceTypeGPU,
 		cuda:        true,
 		kind:        DeviceKindNVML,
-		quit:        make(chan struct{}),
 		newWork:     make(chan *work.Work, 5),
 		workDone:    workDone,
 		fanPercent:  0,
@@ -294,7 +291,7 @@ func NewCuDevice(index int, order int, deviceID cu.Device,
 	return d, nil
 }
 
-func (d *Device) runDevice() error {
+func (d *Device) runDevice(ctx context.Context) error {
 	// Initialize the nonces for the device such that each device in the same
 	// system is doing different work while also helping prevent collisions
 	// across multiple processes and systems working on the same template.
@@ -350,11 +347,12 @@ func (d *Device) runDevice() error {
 	nonceResultsHSlice := *(*[]uint32)(unsafe.Pointer(&nonceResultsHSliceHeader))
 
 	// Mining loop.
+	ctxDoneCh := ctx.Done()
 	for {
-		d.updateCurrentWork()
+		d.updateCurrentWork(ctx)
 
 		select {
-		case <-d.quit:
+		case <-ctxDoneCh:
 			return nil
 		default:
 		}
diff --git a/device.go b/device.go
index 0e72d44..a7ecd54 100644
--- a/device.go
+++ b/device.go
@@ -3,6 +3,7 @@
 package main
 
 import (
+	"context"
 	"crypto/rand"
 	"encoding/binary"
 	"fmt"
@@ -96,7 +97,7 @@ func (d *Device) initNonces() error {
 	return nil
 }
 
-func (d *Device) updateCurrentWork() {
+func (d *Device) updateCurrentWork(ctx context.Context) {
 	var w *work.Work
 	if d.hasWork {
 		// If we already have work, we just need to check if there's new one
@@ -107,11 +108,10 @@ func (d *Device) updateCurrentWork() {
 			return
 		}
 	} else {
-		// If we don't have work, we block until we do. We need to watch for
-		// quit events too.
+		// If we don't have work, we block until we do.
 		select {
 		case w = <-d.newWork:
-		case <-d.quit:
+		case <-ctx.Done():
 			return
 		}
 	}
@@ -143,8 +143,8 @@ func (d *Device) updateCurrentWork() {
 	minrLog.Tracef("work data for work update: %x", d.work.Data)
 }
 
-func (d *Device) Run() {
-	err := d.runDevice()
+func (d *Device) Run(ctx context.Context) {
+	err := d.runDevice(ctx)
 	if err != nil {
 		minrLog.Errorf("Error on device: %v", err)
 	}
@@ -354,14 +354,10 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	}
 }
 
-func (d *Device) Stop() {
-	close(d.quit)
-}
-
-func (d *Device) SetWork(w *work.Work) {
+func (d *Device) SetWork(ctx context.Context, w *work.Work) {
 	select {
 	case d.newWork <- w:
-	case <-d.quit:
+	case <-ctx.Done():
 	}
 }
 
diff --git a/main.go b/main.go
index 931a24e..d1efd31 100644
--- a/main.go
+++ b/main.go
@@ -4,7 +4,6 @@ import (
 	"net"
 	"net/http"
 	"os"
-	"os/signal"
 	"runtime"
 	"runtime/pprof"
 	"time"
@@ -86,15 +85,9 @@ func gominerMain() error {
 		go RunMonitor(m)
 	}
 
-	c := make(chan os.Signal, 1)
-	signal.Notify(c, os.Interrupt)
-	go func() {
-		<-c
-		mainLog.Warn("Got Control+C, exiting...")
-		m.Stop()
-	}()
+	ctx := shutdownListener()
 
-	m.Run()
+	m.Run(ctx)
 
 	return nil
 }
diff --git a/miner.go b/miner.go
index 7429641..18fba90 100644
--- a/miner.go
+++ b/miner.go
@@ -3,6 +3,7 @@
 package main
 
 import (
+	"context"
 	"fmt"
 	"sync"
 	"sync/atomic"
@@ -23,7 +24,6 @@ type Miner struct {
 	started          uint32
 	devices          []*Device
 	workDone         chan []byte
-	quit             chan struct{}
 	needsWorkRefresh chan struct{}
 	wg               sync.WaitGroup
 	pool             *stratum.Stratum
@@ -32,7 +32,6 @@ type Miner struct {
 func NewMiner() (*Miner, error) {
 	m := &Miner{
 		workDone:         make(chan []byte, 10),
-		quit:             make(chan struct{}),
 		needsWorkRefresh: make(chan struct{}),
 	}
 
@@ -59,12 +58,12 @@ func NewMiner() (*Miner, error) {
 	return m, nil
 }
 
-func (m *Miner) workSubmitThread() {
+func (m *Miner) workSubmitThread(ctx context.Context) {
 	defer m.wg.Done()
 
 	for {
 		select {
-		case <-m.quit:
+		case <-ctx.Done():
 			return
 		case data := <-m.workDone:
 			// Only use that is we are not using a pool.
@@ -84,7 +83,7 @@ func (m *Miner) workSubmitThread() {
 
 					select {
 					case m.needsWorkRefresh <- struct{}{}:
-					case <-m.quit:
+					case <-ctx.Done():
 					}
 				}
 			} else {
@@ -107,7 +106,7 @@ func (m *Miner) workSubmitThread() {
 
 					select {
 					case m.needsWorkRefresh <- struct{}{}:
-					case <-m.quit:
+					case <-ctx.Done():
 					}
 				}
 			}
@@ -115,7 +114,7 @@ func (m *Miner) workSubmitThread() {
 	}
 }
 
-func (m *Miner) workRefreshThread() {
+func (m *Miner) workRefreshThread(ctx context.Context) {
 	defer m.wg.Done()
 
 	t := time.NewTicker(100 * time.Millisecond)
@@ -129,7 +128,7 @@ func (m *Miner) workRefreshThread() {
 				minrLog.Errorf("Error in getwork: %v", err)
 			} else {
 				for _, d := range m.devices {
-					d.SetWork(work)
+					d.SetWork(ctx, work)
 				}
 			}
 		} else {
@@ -141,7 +140,7 @@ func (m *Miner) workRefreshThread() {
 					minrLog.Errorf("Error in getpoolwork: %v", err)
 				} else {
 					for _, d := range m.devices {
-						d.SetWork(work)
+						d.SetWork(ctx, work)
 					}
 				}
 			} else {
@@ -149,7 +148,7 @@ func (m *Miner) workRefreshThread() {
 			}
 		}
 		select {
-		case <-m.quit:
+		case <-ctx.Done():
 			return
 		case <-t.C:
 		case <-m.needsWorkRefresh:
@@ -157,7 +156,7 @@ func (m *Miner) workRefreshThread() {
 	}
 }
 
-func (m *Miner) printStatsThread() {
+func (m *Miner) printStatsThread(ctx context.Context) {
 	defer m.wg.Done()
 
 	t := time.NewTicker(time.Second * 5)
@@ -196,7 +195,7 @@ func (m *Miner) printStatsThread() {
 		}
 
 		select {
-		case <-m.quit:
+		case <-ctx.Done():
 			return
 		case <-t.C:
 		case <-m.needsWorkRefresh:
@@ -204,45 +203,38 @@ func (m *Miner) printStatsThread() {
 	}
 }
 
-func (m *Miner) Run() {
+func (m *Miner) Run(ctx context.Context) {
 	m.wg.Add(len(m.devices))
 
 	for _, d := range m.devices {
 		device := d
 		go func() {
-			device.Run()
+			device.Run(ctx)
 			device.Release()
 			m.wg.Done()
 		}()
 	}
 
 	m.wg.Add(1)
-	go m.workSubmitThread()
+	go m.workSubmitThread(ctx)
 
 	if cfg.Benchmark {
 		minrLog.Warn("Running in BENCHMARK mode! No real mining taking place!")
 		work := &work.Work{}
 		for _, d := range m.devices {
-			d.SetWork(work)
+			d.SetWork(ctx, work)
 		}
 	} else {
 		m.wg.Add(1)
-		go m.workRefreshThread()
+		go m.workRefreshThread(ctx)
 	}
 
 	m.wg.Add(1)
-	go m.printStatsThread()
+	go m.printStatsThread(ctx)
 
 	m.wg.Wait()
 }
 
-func (m *Miner) Stop() {
-	close(m.quit)
-	for _, d := range m.devices {
-		d.Stop()
-	}
-}
-
 func (m *Miner) Status() (uint64, uint64, uint64, uint64, float64) {
 	if cfg.Pool != "" {
 		valid := atomic.LoadUint64(&m.pool.ValidShares)
diff --git a/signal.go b/signal.go
new file mode 100644
index 0000000..347cf7d
--- /dev/null
+++ b/signal.go
@@ -0,0 +1,58 @@
+// Copyright (c) 2013-2016 The btcsuite developers
+// Copyright (c) 2015-2016 The Decred developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"context"
+	"os"
+	"os/signal"
+)
+
+// shutdownRequestChannel is used to initiate shutdown from one of the
+// subsystems using the same code paths as when an interrupt signal is received.
+var shutdownRequestChannel = make(chan struct{})
+
+// interruptSignals defines the default signals to catch in order to do a proper
+// shutdown.  This may be modified during init depending on the platform.
+var interruptSignals = []os.Signal{os.Interrupt}
+
+// shutdownListener listens for OS Signals such as SIGINT (Ctrl+C) and shutdown
+// requests from shutdownRequestChannel.  It returns a context that is canceled
+// when either signal is received.
+func shutdownListener() context.Context {
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		interruptChannel := make(chan os.Signal, 1)
+		signal.Notify(interruptChannel, interruptSignals...)
+
+		// Listen for initial shutdown signal and cancel the returned context.
+		select {
+		case sig := <-interruptChannel:
+			minrLog.Infof("Received signal (%s).  Shutting down...", sig)
+
+		case <-shutdownRequestChannel:
+			minrLog.Infof("Shutdown requested.  Shutting down...")
+		}
+		cancel()
+
+		// Listen for repeated signals and display a message so the user
+		// knows the shutdown is in progress and the process is not
+		// hung.
+		for {
+			select {
+			case sig := <-interruptChannel:
+				minrLog.Infof("Received signal (%s).  Already "+
+					"shutting down...", sig)
+
+			case <-shutdownRequestChannel:
+				minrLog.Info("Shutdown requested.  Already " +
+					"shutting down...")
+			}
+		}
+	}()
+
+	return ctx
+}
diff --git a/signal_syscall.go b/signal_syscall.go
new file mode 100644
index 0000000..545d82f
--- /dev/null
+++ b/signal_syscall.go
@@ -0,0 +1,15 @@
+// Copyright (c) 2021-2022 The Decred developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+//
+//go:build windows || aix || android || darwin || dragonfly || freebsd || hurd || illumos || ios || linux || netbsd || openbsd || solaris
+
+package main
+
+import (
+	"syscall"
+)
+
+func init() {
+	interruptSignals = append(interruptSignals, syscall.SIGTERM)
+}

From 92c6300ffbf05606d52867d6a43309ff65d3994c Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 10:37:26 +0100
Subject: [PATCH 117/150] build: blake256 is a direct dependency.

The project directly makes use of blake256 in miner.go, however it was
marked as an indirect dependency in go.mod.

This is fixed by `go mod tidy`.
---
 go.mod | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go.mod b/go.mod
index 1f53236..4958cc8 100644
--- a/go.mod
+++ b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0
 	github.com/decred/dcrd/chaincfg/chainhash v1.0.4
 	github.com/decred/dcrd/chaincfg/v3 v3.2.0
+	github.com/decred/dcrd/crypto/blake256 v1.0.1
 	github.com/decred/dcrd/dcrutil/v4 v4.0.1
 	github.com/decred/dcrd/wire v1.6.0
 	github.com/decred/go-socks v1.1.0
@@ -20,7 +21,6 @@ require (
 	github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 // indirect
 	github.com/dchest/siphash v1.2.3 // indirect
 	github.com/decred/base58 v1.0.5 // indirect
-	github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect
 	github.com/decred/dcrd/crypto/ripemd160 v1.0.2 // indirect
 	github.com/decred/dcrd/dcrec v1.0.1 // indirect
 	github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3 // indirect

From 421a9942726cc4bdfba1228fbc635ffe79a63c15 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:19:46 +0100
Subject: [PATCH 118/150] build: Don't install beignet-dev in CI.

---
 .github/workflows/go.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 87ae3b6..1061f52 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -20,7 +20,7 @@ jobs:
       - name: Install build dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y beignet-dev nvidia-cuda-dev nvidia-cuda-toolkit ocl-icd-opencl-dev opencl-headers nvidia-opencl-dev
+          sudo apt-get install -y nvidia-cuda-dev nvidia-cuda-toolkit ocl-icd-opencl-dev opencl-headers nvidia-opencl-dev
       - name: Install Linters
         run: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.53.3
       - name: Build OpenCL

From c402662c82b50f54065bc78b283b640a2c1fb258 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 11:45:33 +0100
Subject: [PATCH 119/150] build: Update CI toolchain.

Use latest GitHub Actions and linter. Update to go 1.21.
---
 .github/workflows/go.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 1061f52..84167bc 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -6,23 +6,23 @@ permissions:
 jobs:
   build:
     name: Go CI
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     strategy:
       matrix:
-        go: ['1.19', '1.20']
+        go: ['1.20', '1.21']
     steps:
       - name: Set up Go
-        uses: actions/setup-go@fac708d6674e30b6ba41289acaab6d4b75aa0753 #v4.0.1
+        uses: actions/setup-go@93397bea11091df50f3d7e59dc26a7711a8bcfbe #v4.1.0
         with:
           go-version: ${{ matrix.go }}
       - name: Check out source
-        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 #v3.5.3
+        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac #v4.0.0
       - name: Install build dependencies
         run: |
           sudo apt-get update
           sudo apt-get install -y nvidia-cuda-dev nvidia-cuda-toolkit ocl-icd-opencl-dev opencl-headers nvidia-opencl-dev
       - name: Install Linters
-        run: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.53.3
+        run: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.54.2
       - name: Build OpenCL
         env:
           CL_TARGET_OPENCL_VERSION: "220"

From 63d0822992cd0e8dbf524b898c05ac660a857646 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 11:45:43 +0100
Subject: [PATCH 120/150] build: Add run_tests.sh.

A script to run tests and linters is useful for development.
---
 .github/workflows/go.yml |  7 ++-----
 run_tests.sh             | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 5 deletions(-)
 create mode 100755 run_tests.sh

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 84167bc..879ff50 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -26,9 +26,6 @@ jobs:
       - name: Build OpenCL
         env:
           CL_TARGET_OPENCL_VERSION: "220"
-
         run: go build -tags opencl ./...
-      - name: Lint
-        run: golangci-lint -c ./.golangci.yml run
-      - name: Test
-        run: go test -tags opencl -v ./...
+      - name: Test and Lint
+        run: ./run_tests.sh
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 0000000..e314878
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2020-2023 The Decred developers
+# Use of this source code is governed by an ISC
+# license that can be found in the LICENSE file.
+#
+# Usage:
+#   ./run_tests.sh
+
+set -e
+
+go version
+
+# Run tests.
+go test -tags opencl -v ./...
+
+# Run linters.
+golangci-lint run
+
+echo "-----------------------------"
+echo "Tests completed successfully!"

From 600f6ea83e79550780e0b2d4fbb2115505e3e30e Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 11:45:55 +0100
Subject: [PATCH 121/150] Move ithOrFirstInt to where it is used.

The func ithOrFirstInt is declared in config.go but only used in
cudevice.go. Moving it means the nolint directive can be removed.
---
 config.go   | 10 ----------
 cudevice.go |  8 ++++++++
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/config.go b/config.go
index 5b28b0c..d891c21 100644
--- a/config.go
+++ b/config.go
@@ -262,16 +262,6 @@ func commaListToInts(s string) ([]int, error) {
 	return res, nil
 }
 
-// ithOrFirstInt returns s[index] if len(s) > index or s[0] if not.
-//
-//nolint:unused
-func ithOrFirstInt(s []int, index int) int {
-	if index < len(s) {
-		return s[index]
-	}
-	return s[0]
-}
-
 // loadConfig initializes and parses the config using a config file and command
 // line options.
 //
diff --git a/cudevice.go b/cudevice.go
index 90ae14e..557b497 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -118,6 +118,14 @@ func deviceStats(index int) (uint32, uint32) {
 	return fanPercent, temperature
 }
 
+// ithOrFirstInt returns s[index] if len(s) > index or s[0] if not.
+func ithOrFirstInt(s []int, index int) int {
+	if index < len(s) {
+		return s[index]
+	}
+	return s[0]
+}
+
 // unsupported -- just here for compilation
 func fanControlSet(index int, fanCur uint32, tempTargetType string,
 	fanChangeLevel string) {

From 43e4b271a60616dde5fb82a8fbf7e403db19a4e6 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 11:46:33 +0100
Subject: [PATCH 122/150] Format error strings.

Remove leading caps and trailing punctuation.
---
 cladldevice.go     | 10 +++++-----
 cldevice.go        |  6 +++---
 config.go          | 34 +++++++++++++++++-----------------
 cudevice.go        |  4 ++--
 getwork.go         | 16 ++++++++--------
 miner.go           |  2 +-
 stratum/stratum.go |  8 ++++----
 util/util.go       |  2 +-
 8 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/cladldevice.go b/cladldevice.go
index 9d17216..21d33ab 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -185,12 +185,12 @@ func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
 	var platformID cl.CL_platform_id
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return platformID, nil, fmt.Errorf("Could not get CL platforms: %v", err)
+		return platformID, nil, fmt.Errorf("could not get CL platforms: %v", err)
 	}
 	platformID = platformIDs[0]
 	CLdeviceIDs, err := getCLDevices(platformID)
 	if err != nil {
-		return platformID, nil, fmt.Errorf("Could not get CL devices for platform: %v", err)
+		return platformID, nil, fmt.Errorf("could not get CL devices for platform: %v", err)
 	}
 	return platformID, CLdeviceIDs, nil
 }
@@ -299,7 +299,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	// Load kernel source.
 	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
 	if err != nil {
-		return nil, fmt.Errorf("Could not load kernel source: %v", err)
+		return nil, fmt.Errorf("could not load kernel source: %v", err)
 	}
 
 	// Create the program.
@@ -580,14 +580,14 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return nil, 0, fmt.Errorf("Could not get CL platforms: %v", err)
+		return nil, 0, fmt.Errorf("could not get CL platforms: %v", err)
 	}
 
 	for p := range platformIDs {
 		platformID := platformIDs[p]
 		CLdeviceIDs, err := getCLDevices(platformID)
 		if err != nil {
-			return nil, 0, fmt.Errorf("Could not get CL devices for platform: %v", err)
+			return nil, 0, fmt.Errorf("could not get CL devices for platform: %v", err)
 		}
 
 		for _, CLdeviceID := range CLdeviceIDs {
diff --git a/cldevice.go b/cldevice.go
index 825971c..3f2815b 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -420,7 +420,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	// Load kernel source.
 	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
 	if err != nil {
-		return nil, fmt.Errorf("Could not load kernel source: %v", err)
+		return nil, fmt.Errorf("could not load kernel source: %v", err)
 	}
 
 	// Create the program.
@@ -708,14 +708,14 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return nil, 0, fmt.Errorf("Could not get CL platforms: %v", err)
+		return nil, 0, fmt.Errorf("could not get CL platforms: %v", err)
 	}
 
 	for p := range platformIDs {
 		platformID := platformIDs[p]
 		CLdeviceIDs, err := getCLDevices(platformID)
 		if err != nil {
-			return nil, 0, fmt.Errorf("Could not get CL devices for platform: %v", err)
+			return nil, 0, fmt.Errorf("could not get CL devices for platform: %v", err)
 		}
 
 		for _, CLdeviceID := range CLdeviceIDs {
diff --git a/config.go b/config.go
index d891c21..5328033 100644
--- a/config.go
+++ b/config.go
@@ -368,7 +368,7 @@ func loadConfig() (*config, []string, error) {
 			for i := range specifiedAutocalibrates {
 				j, err := strconv.Atoi(specifiedAutocalibrates[i])
 				if err != nil {
-					err := fmt.Errorf("Could not convert autocalibration "+
+					err := fmt.Errorf("could not convert autocalibration "+
 						"(%v) to int: %s", specifiedAutocalibrates[i],
 						err.Error())
 					fmt.Fprintln(os.Stderr, err)
@@ -382,7 +382,7 @@ func loadConfig() (*config, []string, error) {
 			cfg.AutocalibrateInts = make([]int, 1)
 			i, err := strconv.Atoi(cfg.Autocalibrate)
 			if err != nil {
-				err := fmt.Errorf("Could not convert autocalibration %v "+
+				err := fmt.Errorf("could not convert autocalibration %v "+
 					"to int: %s", cfg.Autocalibrate, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
@@ -404,7 +404,7 @@ func loadConfig() (*config, []string, error) {
 			for i := range specifiedDevices {
 				j, err := strconv.Atoi(specifiedDevices[i])
 				if err != nil {
-					err := fmt.Errorf("Could not convert device number %v "+
+					err := fmt.Errorf("could not convert device number %v "+
 						"(%v) to int: %s", i+1, specifiedDevices[i],
 						err.Error())
 					fmt.Fprintln(os.Stderr, err)
@@ -418,7 +418,7 @@ func loadConfig() (*config, []string, error) {
 			cfg.DeviceIDs = make([]int, 1)
 			i, err := strconv.Atoi(cfg.Devices)
 			if err != nil {
-				err := fmt.Errorf("Could not convert specified device %v "+
+				err := fmt.Errorf("could not convert specified device %v "+
 					"to int: %s", cfg.Devices, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
@@ -437,7 +437,7 @@ func loadConfig() (*config, []string, error) {
 			for i := range specifiedIntensities {
 				j, err := strconv.Atoi(specifiedIntensities[i])
 				if err != nil {
-					err := fmt.Errorf("Could not convert intensity "+
+					err := fmt.Errorf("could not convert intensity "+
 						"(%v) to int: %s", specifiedIntensities[i],
 						err.Error())
 					fmt.Fprintln(os.Stderr, err)
@@ -451,7 +451,7 @@ func loadConfig() (*config, []string, error) {
 			cfg.IntensityInts = make([]int, 1)
 			i, err := strconv.Atoi(cfg.Intensity)
 			if err != nil {
-				err := fmt.Errorf("Could not convert intensity %v "+
+				err := fmt.Errorf("could not convert intensity %v "+
 					"to int: %s", cfg.Intensity, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
@@ -464,8 +464,8 @@ func loadConfig() (*config, []string, error) {
 	for i := range cfg.IntensityInts {
 		if (cfg.IntensityInts[i] < minIntensity) ||
 			(cfg.IntensityInts[i] > maxIntensity) {
-			err := fmt.Errorf("Intensity %v not within "+
-				"range %v to %v.", cfg.IntensityInts[i], minIntensity,
+			err := fmt.Errorf("intensity %v not within "+
+				"range %v to %v", cfg.IntensityInts[i], minIntensity,
 				maxIntensity)
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err
@@ -489,7 +489,7 @@ func loadConfig() (*config, []string, error) {
 			for i := range specifiedTempTargets {
 				j, err := strconv.Atoi(specifiedTempTargets[i])
 				if err != nil {
-					err := fmt.Errorf("Could not convert temptarget "+
+					err := fmt.Errorf("could not convert temptarget "+
 						"(%v) to int: %s", specifiedTempTargets[i],
 						err.Error())
 					fmt.Fprintln(os.Stderr, err)
@@ -503,7 +503,7 @@ func loadConfig() (*config, []string, error) {
 			cfg.TempTargetInts = make([]uint32, 1)
 			i, err := strconv.Atoi(cfg.TempTarget)
 			if err != nil {
-				err := fmt.Errorf("Could not convert temptarget %v "+
+				err := fmt.Errorf("could not convert temptarget %v "+
 					"to int: %s", cfg.TempTarget, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
@@ -521,13 +521,13 @@ func loadConfig() (*config, []string, error) {
 
 	for i := range cfg.TempTargetInts {
 		if cfg.TempTargetInts[i] < minTempTarget {
-			err := fmt.Errorf("Temp target %v is lower than minimum %v",
+			err := fmt.Errorf("temp target %v is lower than minimum %v",
 				cfg.TempTargetInts[i], minTempTarget)
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err
 		}
 		if cfg.TempTargetInts[i] > maxTempTarget {
-			err := fmt.Errorf("Temp target %v is higher than maximum %v",
+			err := fmt.Errorf("temp target %v is higher than maximum %v",
 				cfg.TempTargetInts[i], maxTempTarget)
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err
@@ -543,7 +543,7 @@ func loadConfig() (*config, []string, error) {
 			for i := range specifiedWorkSizes {
 				j, err := strconv.Atoi(specifiedWorkSizes[i])
 				if err != nil {
-					err := fmt.Errorf("Could not convert worksize "+
+					err := fmt.Errorf("could not convert worksize "+
 						"(%v) to int: %s", specifiedWorkSizes[i],
 						err.Error())
 					fmt.Fprintln(os.Stderr, err)
@@ -557,7 +557,7 @@ func loadConfig() (*config, []string, error) {
 			cfg.WorkSizeInts = make([]uint32, 1)
 			i, err := strconv.Atoi(cfg.WorkSize)
 			if err != nil {
-				err := fmt.Errorf("Could not convert worksize %v "+
+				err := fmt.Errorf("could not convert worksize %v "+
 					"to int: %s", cfg.WorkSize, err.Error())
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
@@ -569,19 +569,19 @@ func loadConfig() (*config, []string, error) {
 
 	for i := range cfg.WorkSizeInts {
 		if cfg.WorkSizeInts[i] < 256 {
-			err := fmt.Errorf("Too small WorkSize passed: %v, min 256",
+			err := fmt.Errorf("too small WorkSize passed: %v, min 256",
 				cfg.WorkSizeInts[i])
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err
 		}
 		if cfg.WorkSizeInts[i] > maxWorkSize {
-			err := fmt.Errorf("Too big WorkSize passed: %v, max %v",
+			err := fmt.Errorf("too big WorkSize passed: %v, max %v",
 				cfg.WorkSizeInts[i], maxWorkSize)
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err
 		}
 		if cfg.WorkSizeInts[i]%256 != 0 {
-			err := fmt.Errorf("Work size %v not a multiple of 256",
+			err := fmt.Errorf("work size %v not a multiple of 256",
 				cfg.WorkSizeInts[i])
 			fmt.Fprintln(os.Stderr, err)
 			return nil, nil, err
diff --git a/cudevice.go b/cudevice.go
index 557b497..79b5868 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -159,13 +159,13 @@ func getCUDevices() ([]cu.Device, error) {
 	minMinor := 5
 
 	if maj < minMajor || (maj == minMajor && min < minMinor) {
-		return nil, fmt.Errorf("Driver does not support CUDA %v.%v API", minMajor, minMinor)
+		return nil, fmt.Errorf("driver does not support CUDA %v.%v API", minMajor, minMinor)
 	}
 
 	var numDevices int
 	numDevices = cu.DeviceGetCount()
 	if numDevices < 1 {
-		return nil, fmt.Errorf("No devices found")
+		return nil, fmt.Errorf("no devices found")
 	}
 	devices := make([]cu.Device, numDevices)
 	for i := 0; i < numDevices; i++ {
diff --git a/getwork.go b/getwork.go
index 53d81f7..1fb3364 100644
--- a/getwork.go
+++ b/getwork.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 package main
 
@@ -133,7 +133,7 @@ func GetWork() (*work.Work, error) {
 	}
 
 	if httpResponse.Status != "200 OK" {
-		return nil, fmt.Errorf("HTTP %s: %s", httpResponse.Status, body)
+		return nil, fmt.Errorf("http status %s: %s", httpResponse.Status, body)
 	}
 
 	var res getWorkResponseJson
@@ -143,7 +143,7 @@ func GetWork() (*work.Work, error) {
 	}
 
 	if res.Error != nil {
-		return nil, fmt.Errorf("JSONRPC Error %d: %s", res.Error.Code,
+		return nil, fmt.Errorf("json error %d: %s", res.Error.Code,
 			res.Error.Message)
 	}
 
@@ -152,7 +152,7 @@ func GetWork() (*work.Work, error) {
 		return nil, err
 	}
 	if len(data) != 192 {
-		return nil, fmt.Errorf("Wrong data length: got %d, expected 192",
+		return nil, fmt.Errorf("wrong data length: got %d, expected 192",
 			len(data))
 	}
 	target, err := hex.DecodeString(res.Result.Target)
@@ -160,7 +160,7 @@ func GetWork() (*work.Work, error) {
 		return nil, err
 	}
 	if len(target) != 32 {
-		return nil, fmt.Errorf("Wrong target length: got %d, expected 32",
+		return nil, fmt.Errorf("wrong target length: got %d, expected 32",
 			len(target))
 	}
 
@@ -187,7 +187,7 @@ func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 		pool.PoolWork.NewWork = false
 
 		if pool.PoolWork.JobID == "" {
-			return nil, fmt.Errorf("No work available (no job id)")
+			return nil, fmt.Errorf("no work available (no job id)")
 		}
 
 		err := pool.PrepWork()
@@ -206,7 +206,7 @@ func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 		return pool.PoolWork.Work, nil
 	}
 
-	return nil, fmt.Errorf("No work available.")
+	return nil, fmt.Errorf("no work available")
 }
 
 // GetWork makes a getwork RPC call and returns the result (data and target)
@@ -261,7 +261,7 @@ func GetWorkSubmit(data []byte) (bool, error) {
 	}
 
 	if res.Error != nil {
-		return false, fmt.Errorf("JSONRPC Error %d: %s", res.Error.Code,
+		return false, fmt.Errorf("json error %d: %s", res.Error.Code,
 			res.Error.Message)
 	}
 
diff --git a/miner.go b/miner.go
index 18fba90..e1d3427 100644
--- a/miner.go
+++ b/miner.go
@@ -50,7 +50,7 @@ func NewMiner() (*Miner, error) {
 	}
 
 	if deviceListEnabledCount == 0 {
-		return nil, fmt.Errorf("No devices started")
+		return nil, fmt.Errorf("no devices started")
 	}
 
 	m.started = uint32(time.Now().Unix())
diff --git a/stratum/stratum.go b/stratum/stratum.go
index 2646d6c..b89f4e7 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2016 The Decred developers.
+// Copyright (c) 2016-2023 The Decred developers.
 
 package stratum
 
@@ -34,7 +34,7 @@ import (
 var chainParams = chaincfg.MainNetParams()
 
 // ErrStratumStaleWork indicates that the work to send to the pool was stale.
-var ErrStratumStaleWork = fmt.Errorf("Stale work, throwing away")
+var ErrStratumStaleWork = errors.New("stale work, throwing away")
 
 // Stratum holds all the shared information for a stratum connection.
 // XXX most of these should be unexported and use getters/setters.
@@ -151,7 +151,7 @@ type Submit struct {
 }
 
 // errJsonType is an error for json that we do not expect.
-var errJsonType = errors.New("Unexpected type in json.")
+var errJsonType = errors.New("unexpected type in json")
 
 func sliceContains(s []uint64, e uint64) bool {
 	for _, a := range s {
@@ -188,7 +188,7 @@ func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string)
 	if strings.HasPrefix(pool, proto) {
 		pool = strings.Replace(pool, proto, "", 1)
 	} else {
-		err := errors.New("Only stratum pools supported.")
+		err := errors.New("only stratum pools supported")
 		return nil, err
 	}
 	var conn net.Conn
diff --git a/util/util.go b/util/util.go
index a7f71c4..40777fd 100644
--- a/util/util.go
+++ b/util/util.go
@@ -24,7 +24,7 @@ func reverseS(s string) (string, error) {
 	a := strings.Split(s, "")
 	sRev := ""
 	if len(a)%2 != 0 {
-		return "", fmt.Errorf("Incorrect input length")
+		return "", fmt.Errorf("incorrect input length")
 	}
 	for i := 0; i < len(a); i += 2 {
 		tmp := []string{a[i], a[i+1], sRev}

From 5fbb0c39084869489285311ba2eb8fed7b9366ad Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 11:46:50 +0100
Subject: [PATCH 123/150] Consistent string format verbs for errors.

Always use %v to print errors and %w to wrap errors.
---
 cladldevice.go | 10 +++++-----
 cldevice.go    | 12 ++++++------
 config.go      | 32 +++++++++++++-------------------
 getwork.go     |  4 ++--
 main.go        |  2 +-
 5 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/cladldevice.go b/cladldevice.go
index 21d33ab..5fb8247 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -185,12 +185,12 @@ func getCLInfo() (cl.CL_platform_id, []cl.CL_device_id, error) {
 	var platformID cl.CL_platform_id
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return platformID, nil, fmt.Errorf("could not get CL platforms: %v", err)
+		return platformID, nil, fmt.Errorf("could not get CL platforms: %w", err)
 	}
 	platformID = platformIDs[0]
 	CLdeviceIDs, err := getCLDevices(platformID)
 	if err != nil {
-		return platformID, nil, fmt.Errorf("could not get CL devices for platform: %v", err)
+		return platformID, nil, fmt.Errorf("could not get CL devices for platform: %w", err)
 	}
 	return platformID, CLdeviceIDs, nil
 }
@@ -299,7 +299,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	// Load kernel source.
 	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
 	if err != nil {
-		return nil, fmt.Errorf("could not load kernel source: %v", err)
+		return nil, fmt.Errorf("could not load kernel source: %w", err)
 	}
 
 	// Create the program.
@@ -580,14 +580,14 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return nil, 0, fmt.Errorf("could not get CL platforms: %v", err)
+		return nil, 0, fmt.Errorf("could not get CL platforms: %w", err)
 	}
 
 	for p := range platformIDs {
 		platformID := platformIDs[p]
 		CLdeviceIDs, err := getCLDevices(platformID)
 		if err != nil {
-			return nil, 0, fmt.Errorf("could not get CL devices for platform: %v", err)
+			return nil, 0, fmt.Errorf("could not get CL devices for platform: %w", err)
 		}
 
 		for _, CLdeviceID := range CLdeviceIDs {
diff --git a/cldevice.go b/cldevice.go
index 3f2815b..758f2ef 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -71,7 +71,7 @@ func amdgpuFanPermissionsValid(index int) error {
 		if os.IsPermission(err) {
 			return fmt.Errorf("path %v is not writable", path)
 		} else {
-			return fmt.Errorf("path %v unusable %v", path, err)
+			return fmt.Errorf("path %v unusable %w", path, err)
 		}
 	}
 
@@ -155,7 +155,7 @@ func fanControlSet(index int, fanCur uint32, tempTargetType string,
 		fanNewValue, fanPath)
 	err := deviceStatsWriteSysfsEntry(fanPath, fanNewValue)
 	if err != nil {
-		minrLog.Errorf("DEV #%d unable to adjust fan: %v", index, err.Error())
+		minrLog.Errorf("DEV #%d unable to adjust fan: %v", index, err)
 	} else {
 		minrLog.Infof("DEV #%d successfully adjusted fan from %v%% to %v%% to "+
 			"%v temp", index, fanCur, fanNewPercent,
@@ -316,7 +316,7 @@ func deviceStatsWriteSysfsEntry(path string, value uint32) error {
 	stringValue := strconv.Itoa(int(value)) + "\n"
 	err := ioutil.WriteFile(path, []byte(stringValue), 0644)
 	if err != nil {
-		return fmt.Errorf("unable to write %v to %v: %v", value, path, err)
+		return fmt.Errorf("unable to write %v to %v: %w", value, path, err)
 	}
 
 	return nil
@@ -420,7 +420,7 @@ func NewDevice(index int, order int, platformID cl.CL_platform_id, deviceID cl.C
 	// Load kernel source.
 	progSrc, progSize, err := loadProgramSource(cfg.ClKernel)
 	if err != nil {
-		return nil, fmt.Errorf("could not load kernel source: %v", err)
+		return nil, fmt.Errorf("could not load kernel source: %w", err)
 	}
 
 	// Create the program.
@@ -708,14 +708,14 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return nil, 0, fmt.Errorf("could not get CL platforms: %v", err)
+		return nil, 0, fmt.Errorf("could not get CL platforms: %w", err)
 	}
 
 	for p := range platformIDs {
 		platformID := platformIDs[p]
 		CLdeviceIDs, err := getCLDevices(platformID)
 		if err != nil {
-			return nil, 0, fmt.Errorf("could not get CL devices for platform: %v", err)
+			return nil, 0, fmt.Errorf("could not get CL devices for platform: %w", err)
 		}
 
 		for _, CLdeviceID := range CLdeviceIDs {
diff --git a/config.go b/config.go
index 5328033..93ff2f9 100644
--- a/config.go
+++ b/config.go
@@ -245,8 +245,7 @@ func commaListToInts(s string) ([]int, error) {
 		for i := range split {
 			j, err := strconv.Atoi(split[i])
 			if err != nil {
-				err := fmt.Errorf("item %q is not an int: %v"+
-					split[i], err)
+				err := fmt.Errorf("item %q is not an int: %w", split[i], err)
 				return nil, err
 			}
 			res[i] = j
@@ -254,7 +253,7 @@ func commaListToInts(s string) ([]int, error) {
 	} else {
 		i, err := strconv.Atoi(s)
 		if err != nil {
-			return nil, fmt.Errorf("%q is not an int: %v", s, err)
+			return nil, fmt.Errorf("%q is not an int: %w", s, err)
 		}
 		res = []int{i}
 	}
@@ -369,8 +368,7 @@ func loadConfig() (*config, []string, error) {
 				j, err := strconv.Atoi(specifiedAutocalibrates[i])
 				if err != nil {
 					err := fmt.Errorf("could not convert autocalibration "+
-						"(%v) to int: %s", specifiedAutocalibrates[i],
-						err.Error())
+						"(%v) to int: %w", specifiedAutocalibrates[i], err)
 					fmt.Fprintln(os.Stderr, err)
 					return nil, nil, err
 				}
@@ -383,7 +381,7 @@ func loadConfig() (*config, []string, error) {
 			i, err := strconv.Atoi(cfg.Autocalibrate)
 			if err != nil {
 				err := fmt.Errorf("could not convert autocalibration %v "+
-					"to int: %s", cfg.Autocalibrate, err.Error())
+					"to int: %w", cfg.Autocalibrate, err)
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
@@ -405,8 +403,7 @@ func loadConfig() (*config, []string, error) {
 				j, err := strconv.Atoi(specifiedDevices[i])
 				if err != nil {
 					err := fmt.Errorf("could not convert device number %v "+
-						"(%v) to int: %s", i+1, specifiedDevices[i],
-						err.Error())
+						"(%v) to int: %w", i+1, specifiedDevices[i], err)
 					fmt.Fprintln(os.Stderr, err)
 					return nil, nil, err
 				}
@@ -419,7 +416,7 @@ func loadConfig() (*config, []string, error) {
 			i, err := strconv.Atoi(cfg.Devices)
 			if err != nil {
 				err := fmt.Errorf("could not convert specified device %v "+
-					"to int: %s", cfg.Devices, err.Error())
+					"to int: %w", cfg.Devices, err)
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
@@ -438,8 +435,7 @@ func loadConfig() (*config, []string, error) {
 				j, err := strconv.Atoi(specifiedIntensities[i])
 				if err != nil {
 					err := fmt.Errorf("could not convert intensity "+
-						"(%v) to int: %s", specifiedIntensities[i],
-						err.Error())
+						"(%v) to int: %w", specifiedIntensities[i], err)
 					fmt.Fprintln(os.Stderr, err)
 					return nil, nil, err
 				}
@@ -452,7 +448,7 @@ func loadConfig() (*config, []string, error) {
 			i, err := strconv.Atoi(cfg.Intensity)
 			if err != nil {
 				err := fmt.Errorf("could not convert intensity %v "+
-					"to int: %s", cfg.Intensity, err.Error())
+					"to int: %w", cfg.Intensity, err)
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
@@ -490,8 +486,7 @@ func loadConfig() (*config, []string, error) {
 				j, err := strconv.Atoi(specifiedTempTargets[i])
 				if err != nil {
 					err := fmt.Errorf("could not convert temptarget "+
-						"(%v) to int: %s", specifiedTempTargets[i],
-						err.Error())
+						"(%v) to int: %w", specifiedTempTargets[i], err)
 					fmt.Fprintln(os.Stderr, err)
 					return nil, nil, err
 				}
@@ -504,7 +499,7 @@ func loadConfig() (*config, []string, error) {
 			i, err := strconv.Atoi(cfg.TempTarget)
 			if err != nil {
 				err := fmt.Errorf("could not convert temptarget %v "+
-					"to int: %s", cfg.TempTarget, err.Error())
+					"to int: %w", cfg.TempTarget, err)
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
@@ -544,8 +539,7 @@ func loadConfig() (*config, []string, error) {
 				j, err := strconv.Atoi(specifiedWorkSizes[i])
 				if err != nil {
 					err := fmt.Errorf("could not convert worksize "+
-						"(%v) to int: %s", specifiedWorkSizes[i],
-						err.Error())
+						"(%v) to int: %w", specifiedWorkSizes[i], err)
 					fmt.Fprintln(os.Stderr, err)
 					return nil, nil, err
 				}
@@ -558,7 +552,7 @@ func loadConfig() (*config, []string, error) {
 			i, err := strconv.Atoi(cfg.WorkSize)
 			if err != nil {
 				err := fmt.Errorf("could not convert worksize %v "+
-					"to int: %s", cfg.WorkSize, err.Error())
+					"to int: %w", cfg.WorkSize, err)
 				fmt.Fprintln(os.Stderr, err)
 				return nil, nil, err
 			}
@@ -612,7 +606,7 @@ func loadConfig() (*config, []string, error) {
 
 	// Parse, validate, and set debug log level(s).
 	if err := parseAndSetDebugLevels(cfg.DebugLevel); err != nil {
-		err := fmt.Errorf("%s: %v", funcName, err.Error())
+		err := fmt.Errorf("%s: %w", funcName, err)
 		fmt.Fprintln(os.Stderr, err)
 		fmt.Fprintln(os.Stderr, usageMessage)
 		return nil, nil, err
diff --git a/getwork.go b/getwork.go
index 1fb3364..68b6e0a 100644
--- a/getwork.go
+++ b/getwork.go
@@ -128,7 +128,7 @@ func GetWork() (*work.Work, error) {
 	body, err := ioutil.ReadAll(httpResponse.Body)
 	httpResponse.Body.Close()
 	if err != nil {
-		err = fmt.Errorf("error reading json reply: %v", err)
+		err = fmt.Errorf("error reading json reply: %w", err)
 		return nil, err
 	}
 
@@ -245,7 +245,7 @@ func GetWorkSubmit(data []byte) (bool, error) {
 	body, err := ioutil.ReadAll(httpResponse.Body)
 	httpResponse.Body.Close()
 	if err != nil {
-		err = fmt.Errorf("error reading json reply: %v", err)
+		err = fmt.Errorf("error reading json reply: %w", err)
 		return false, err
 	}
 
diff --git a/main.go b/main.go
index d1efd31..bd22b3a 100644
--- a/main.go
+++ b/main.go
@@ -52,7 +52,7 @@ func gominerMain() error {
 	if cfg.CPUProfile != "" {
 		f, err := os.Create(cfg.CPUProfile)
 		if err != nil {
-			mainLog.Errorf("Unable to create cpu profile: %v", err.Error())
+			mainLog.Errorf("Unable to create cpu profile: %v", err)
 			return err
 		}
 		pprof.StartCPUProfile(f)

From 2f75233fa97178b8f8b18b4ccc65a004e7f05cf0 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 11:47:02 +0100
Subject: [PATCH 124/150] Use errors.Is and errors.As.

---
 config.go          | 10 +++++++---
 miner.go           |  5 +++--
 stratum/stratum.go |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/config.go b/config.go
index 93ff2f9..bba811f 100644
--- a/config.go
+++ b/config.go
@@ -4,6 +4,7 @@
 package main
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"os"
@@ -300,7 +301,8 @@ func loadConfig() (*config, []string, error) {
 	preParser := flags.NewParser(&preCfg, flags.Default)
 	_, err = preParser.Parse()
 	if err != nil {
-		if e, ok := err.(*flags.Error); !ok || e.Type != flags.ErrHelp {
+		var e *flags.Error
+		if !errors.As(err, &e) || e.Type != flags.ErrHelp {
 			preParser.WriteHelp(os.Stderr)
 		}
 		return nil, nil, err
@@ -325,7 +327,8 @@ func loadConfig() (*config, []string, error) {
 	parser := flags.NewParser(&cfg, flags.Default)
 	err = flags.NewIniParser(parser).ParseFile(preCfg.ConfigFile)
 	if err != nil {
-		if _, ok := err.(*os.PathError); !ok {
+		var e *os.PathError
+		if !errors.As(err, &e) {
 			fmt.Fprintln(os.Stderr, err)
 			parser.WriteHelp(os.Stderr)
 			return nil, nil, err
@@ -336,7 +339,8 @@ func loadConfig() (*config, []string, error) {
 	// Parse command line options again to ensure they take precedence.
 	remainingArgs, err := parser.Parse()
 	if err != nil {
-		if e, ok := err.(*flags.Error); !ok || e.Type != flags.ErrHelp {
+		var e *flags.Error
+		if !errors.As(err, &e) || e.Type != flags.ErrHelp {
 			parser.WriteHelp(os.Stderr)
 		}
 		return nil, nil, err
diff --git a/miner.go b/miner.go
index e1d3427..0f55788 100644
--- a/miner.go
+++ b/miner.go
@@ -4,6 +4,7 @@ package main
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"sync"
 	"sync/atomic"
@@ -89,8 +90,8 @@ func (m *Miner) workSubmitThread(ctx context.Context) {
 			} else {
 				submitted, err := GetPoolWorkSubmit(data, m.pool)
 				if err != nil {
-					switch err {
-					case stratum.ErrStratumStaleWork:
+					switch {
+					case errors.Is(err, stratum.ErrStratumStaleWork):
 						atomic.AddUint64(&m.staleShares, 1)
 						minrLog.Debugf("Share submitted to pool was stale")
 
diff --git a/stratum/stratum.go b/stratum/stratum.go
index b89f4e7..db38de9 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -283,7 +283,7 @@ func (s *Stratum) Listen() {
 	for {
 		result, err := s.Reader.ReadString('\n')
 		if err != nil {
-			if err == io.EOF {
+			if errors.Is(err, io.EOF) {
 				log.Error("Connection lost!  Reconnecting.")
 				err = s.Reconnect()
 				if err != nil {

From e5cdae679540607c3af9c586fc19f16faea0e3ed Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:10:57 +0100
Subject: [PATCH 125/150] Remove unnecessary else branches.

---
 nvml/nvml.go | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/nvml/nvml.go b/nvml/nvml.go
index 15a7a23..0a6d189 100755
--- a/nvml/nvml.go
+++ b/nvml/nvml.go
@@ -58,19 +58,14 @@ func (r Result) Error() string {
 }
 
 func (r Result) SuccessQ() bool {
-	if r.code == 0 {
-		return true
-	} else {
-		return false
-	}
+	return r.code == 0
 }
 
 func NewResult(r C.nvmlReturn_t) error {
 	if r == 0 {
 		return nil
-	} else {
-		return &Result{r}
 	}
+	return &Result{r}
 }
 
 func Init() error {

From 2fcf378661b54473985bb27711a55788880b3595 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:11:55 +0100
Subject: [PATCH 126/150] Remove unnecessary zero value allocation.

---
 nvml/nvml.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvml/nvml.go b/nvml/nvml.go
index 0a6d189..9e24344 100755
--- a/nvml/nvml.go
+++ b/nvml/nvml.go
@@ -84,7 +84,7 @@ func ErrorString(r Result) string {
 }
 
 func DeviceCount() (int, error) {
-	var count C.uint = 0
+	var count C.uint
 	r := NewResult(C.nvmlDeviceGetCount(&count))
 	return int(count), r
 }

From bd34950c7d5b2ca3396230d4c0b4be76f92109c6 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:13:17 +0100
Subject: [PATCH 127/150] Reduce scope of const, use lower case naming.

---
 nvml/nvml.go | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/nvml/nvml.go b/nvml/nvml.go
index 9e24344..a32c8ea 100755
--- a/nvml/nvml.go
+++ b/nvml/nvml.go
@@ -109,18 +109,17 @@ func DeviceComputeMode(dh DeviceHandle) (ComputeMode, error) {
 
 //device name
 
-const STRING_BUFFER_SIZE = 100
-
 func makeStringBuffer(sz int) *C.char {
 	b := make([]byte, sz)
 	return C.CString(string(b))
 }
 
 func DeviceName(dh DeviceHandle) (string, error) {
-	var name *C.char = makeStringBuffer(STRING_BUFFER_SIZE)
+	const stringBufferSize = 100
+	var name *C.char = makeStringBuffer(stringBufferSize)
 	defer C.free(unsafe.Pointer(name))
-	r := NewResult(C.nvmlDeviceGetName(dh.handle, name, C.uint(STRING_BUFFER_SIZE)))
-	return C.GoStringN(name, STRING_BUFFER_SIZE), r
+	r := NewResult(C.nvmlDeviceGetName(dh.handle, name, C.uint(stringBufferSize)))
+	return C.GoStringN(name, stringBufferSize), r
 }
 
 type MemoryInformation struct {

From 91033b35f73eceab49d04db4b15fdeaa1a724d0c Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:14:49 +0100
Subject: [PATCH 128/150] Capitalize ID in variable names.

---
 nvml/nvml.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/nvml/nvml.go b/nvml/nvml.go
index a32c8ea..0f574aa 100755
--- a/nvml/nvml.go
+++ b/nvml/nvml.go
@@ -143,12 +143,12 @@ func DeviceMemoryInformation(dh DeviceHandle) (MemoryInformation, error) {
 }
 
 type PCIInformation struct {
-	BusId       string `json:"bus_id"`
+	BusID       string `json:"bus_id"`
 	Domain      uint   `json:"domain"`
 	Bus         uint   `json:"bus"`
 	Device      uint   `json:"device"`
-	DeviceId    uint   `json:"device_id"`
-	SubSystemId uint   `json:"subsystem_id"`
+	DeviceID    uint   `json:"device_id"`
+	SubSystemID uint   `json:"subsystem_id"`
 }
 
 func DevicePCIInformation(dh DeviceHandle) (PCIInformation, error) {
@@ -156,13 +156,13 @@ func DevicePCIInformation(dh DeviceHandle) (PCIInformation, error) {
 	r := NewResult(C.nvmlDeviceGetPciInfo(dh.handle, &temp))
 	if r == nil {
 		res := PCIInformation{
-			BusId: string(C.GoBytes(unsafe.Pointer(&temp.busId),
+			BusID: string(C.GoBytes(unsafe.Pointer(&temp.busId),
 				C.NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE)),
 			Domain:      uint(temp.domain),
 			Bus:         uint(temp.bus),
 			Device:      uint(temp.device),
-			DeviceId:    uint(temp.pciDeviceId),
-			SubSystemId: uint(temp.pciSubSystemId),
+			DeviceID:    uint(temp.pciDeviceId),
+			SubSystemID: uint(temp.pciSubSystemId),
 		}
 		return res, nil
 	}

From 33e7ebc6e9bf70747fb9eb892c707e840f79c414 Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:16:35 +0100
Subject: [PATCH 129/150] End comments with periods.

---
 cldevice.go |  2 +-
 device.go   |  6 +++---
 getwork.go  | 12 ++++++------
 version.go  |  1 -
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/cldevice.go b/cldevice.go
index 758f2ef..a8f2671 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -61,7 +61,7 @@ func amdgpuFanPercentToValue(percent uint32) uint32 {
 	return value
 }
 
-// validate that we can write to the AMDGPU sysfs fan path
+// validate that we can write to the AMDGPU sysfs fan path.
 func amdgpuFanPermissionsValid(index int) error {
 	path := amdgpuGetSysfsPath(index, "fan")
 
diff --git a/device.go b/device.go
index a7ecd54..959662f 100644
--- a/device.go
+++ b/device.go
@@ -35,7 +35,7 @@ func init() {
 	randDeviceOffset2 = buf[1]
 }
 
-// Constants for fan and temperature bits
+// Constants for fan and temperature bits.
 const (
 	ADLFanFailSafe            = uint32(80)
 	AMDGPUFanFailSafe         = uint32(204)
@@ -150,7 +150,7 @@ func (d *Device) Run(ctx context.Context) {
 	}
 }
 
-// This is pretty hacky/proof-of-concepty
+// This is pretty hacky/proof-of-concepty.
 func (d *Device) fanControl() {
 	d.Lock()
 	defer d.Unlock()
@@ -388,7 +388,7 @@ func (d *Device) PrintStats() {
 	}
 }
 
-// UpdateFanTemp updates a device's statistics
+// UpdateFanTemp updates a device's statistics.
 func (d *Device) UpdateFanTemp() {
 	d.Lock()
 	defer d.Unlock()
diff --git a/getwork.go b/getwork.go
index 68b6e0a..6a0ca0f 100644
--- a/getwork.go
+++ b/getwork.go
@@ -94,7 +94,7 @@ const (
 	RequestTimeout     int = 5
 )
 
-// GetWork makes a getwork RPC call and returns the result (data and target)
+// GetWork makes a getwork RPC call and returns the result (data and target).
 func GetWork() (*work.Work, error) {
 	// Generate a request to the configured RPC server.
 	protocol := "http"
@@ -178,12 +178,12 @@ func GetWork() (*work.Work, error) {
 	return w, nil
 }
 
-// GetPoolWork gets work from a stratum enabled pool
+// GetPoolWork gets work from a stratum enabled pool.
 func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
-	// Get Next work for stratum and mark it as used
+	// Get Next work for stratum and mark it as used.
 	if pool.PoolWork.NewWork {
 		poolLog.Debug("Received new work from pool.")
-		// Mark used
+		// Mark used.
 		pool.PoolWork.NewWork = false
 
 		if pool.PoolWork.JobID == "" {
@@ -209,7 +209,7 @@ func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 	return nil, fmt.Errorf("no work available")
 }
 
-// GetWork makes a getwork RPC call and returns the result (data and target)
+// GetWork makes a getwork RPC call and returns the result (data and target).
 func GetWorkSubmit(data []byte) (bool, error) {
 	// Generate a request to the configured RPC server.
 	protocol := "http"
@@ -268,7 +268,7 @@ func GetWorkSubmit(data []byte) (bool, error) {
 	return res.Result, nil
 }
 
-// GetPoolWorkSubmit sends the result to the stratum enabled pool
+// GetPoolWorkSubmit sends the result to the stratum enabled pool.
 func GetPoolWorkSubmit(data []byte, pool *stratum.Stratum) (bool, error) {
 	pool.Lock()
 	defer pool.Unlock()
diff --git a/version.go b/version.go
index c9c7b64..ddca950 100644
--- a/version.go
+++ b/version.go
@@ -24,7 +24,6 @@ import (
 	"strings"
 )
 
-// semanticAlphabet
 const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-"
 
 // These constants define the application version and follow the semantic

From bf7e790db403958a34eb68872f5263dd9c673c1f Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:29:52 +0100
Subject: [PATCH 130/150] Don't use deprecated ioutil package.

---
 cldevice.go | 5 ++---
 getwork.go  | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cldevice.go b/cldevice.go
index a8f2671..467b7ba 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -11,7 +11,6 @@ import (
 	"context"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"math"
 	"os"
 	"runtime"
@@ -94,7 +93,7 @@ func amdgpuGetSysfsPath(index int, field string) string {
 	hwmonName := ""
 
 	// open hwmon base path and scan for the numbered entry
-	files, err := ioutil.ReadDir(hwmonBasePath)
+	files, err := os.ReadDir(hwmonBasePath)
 	if err != nil {
 		minrLog.Errorf("unable to read AMDGPU sysfs dir %v: %v", hwmonBasePath,
 			err)
@@ -314,7 +313,7 @@ func deviceStatsReadSysfsEntry(path string) uint32 {
 
 func deviceStatsWriteSysfsEntry(path string, value uint32) error {
 	stringValue := strconv.Itoa(int(value)) + "\n"
-	err := ioutil.WriteFile(path, []byte(stringValue), 0644)
+	err := os.WriteFile(path, []byte(stringValue), 0644)
 	if err != nil {
 		return fmt.Errorf("unable to write %v to %v: %w", value, path, err)
 	}
diff --git a/getwork.go b/getwork.go
index 6a0ca0f..52725e4 100644
--- a/getwork.go
+++ b/getwork.go
@@ -10,10 +10,11 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
-	"io/ioutil"
+	"io"
 	"math/big"
 	"net"
 	"net/http"
+	"os"
 	"strconv"
 	"time"
 
@@ -46,7 +47,7 @@ func newHTTPClient(cfg *config) (*http.Client, error) {
 	// Configure TLS if needed.
 	var tlsConfig *tls.Config
 	if !cfg.NoTLS && cfg.RPCCert != "" {
-		pem, err := ioutil.ReadFile(cfg.RPCCert)
+		pem, err := os.ReadFile(cfg.RPCCert)
 		if err != nil {
 			return nil, err
 		}
@@ -125,7 +126,7 @@ func GetWork() (*work.Work, error) {
 		return nil, err
 	}
 
-	body, err := ioutil.ReadAll(httpResponse.Body)
+	body, err := io.ReadAll(httpResponse.Body)
 	httpResponse.Body.Close()
 	if err != nil {
 		err = fmt.Errorf("error reading json reply: %w", err)
@@ -242,7 +243,7 @@ func GetWorkSubmit(data []byte) (bool, error) {
 		return false, err
 	}
 
-	body, err := ioutil.ReadAll(httpResponse.Body)
+	body, err := io.ReadAll(httpResponse.Body)
 	httpResponse.Body.Close()
 	if err != nil {
 		err = fmt.Errorf("error reading json reply: %w", err)

From b94e8574760f022124c3cdcca14fc4e064fe12fa Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Mon, 11 Sep 2023 12:18:09 +0100
Subject: [PATCH 131/150] build: Enable a bunch of new linters.

---
 .golangci.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.golangci.yml b/.golangci.yml
index 10b101b..09d8c67 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -9,17 +9,36 @@ linters:
     - asciicheck
     - bidichk
     - bodyclose
+    - containedctx
+    - dupword
     - durationcheck
+    - errorlint
     - exportloopref
+    - godot
     - gofmt
     - goimports
     - gosimple
     - grouper
     - ineffassign
+    - makezero
     - misspell
     - nosprintfhostport
+    - prealloc
+    - predeclared
     - reassign
     - rowserrcheck
+    - revive
     - tparallel
+    - typecheck
     - unconvert
+    - unparam
     - unused
+
+linters-settings:
+  # While every other rule in the revive linter is useful, the var-naming rule
+  # is a bit too strict for this project because it does not allow underscores
+  # in names.
+  revive:
+    rules:
+      - name: var-naming
+        disabled: true

From d0dc35186afc282023e930965535c4d1195741ce Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Wed, 6 Sep 2023 15:43:02 -0300
Subject: [PATCH 132/150] cuda: Add specific int types to blake3 kernel.

---
 blake3.cu | 68 +++++++++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/blake3.cu b/blake3.cu
index 8f928a0..9cda430 100644
--- a/blake3.cu
+++ b/blake3.cu
@@ -19,48 +19,48 @@
 
 __global__
 void search(
-    uint *output,
+    uint32_t *output,
     // Midstate.
-    const uint cv0,
-    const uint cv1,
-    const uint cv2,
-    const uint cv3,
-    const uint cv4,
-    const uint cv5,
-    const uint cv6,
-    const uint cv7,
+    const uint32_t cv0,
+    const uint32_t cv1,
+    const uint32_t cv2,
+    const uint32_t cv3,
+    const uint32_t cv4,
+    const uint32_t cv5,
+    const uint32_t cv6,
+    const uint32_t cv7,
 
     // Final 52 bytes of data.
-    const uint m0,
-    const uint m1,
-    const uint m2,
-    // const uint m3 : nonce
-    const uint m4,
-    const uint m5,
-    const uint m6,
-    const uint m7,
-    const uint m8,
-    const uint m9,
-    const uint m10,
-    const uint m11,
-    const uint m12)
+    const uint32_t m0,
+    const uint32_t m1,
+    const uint32_t m2,
+    // const uint32_t m3 : nonce
+    const uint32_t m4,
+    const uint32_t m5,
+    const uint32_t m6,
+    const uint32_t m7,
+    const uint32_t m8,
+    const uint32_t m9,
+    const uint32_t m10,
+    const uint32_t m11,
+    const uint32_t m12)
 {
     // Nonce.
-    const uint m3 = blockDim.x * blockIdx.x + threadIdx.x;
+    const uint32_t m3 = blockDim.x * blockIdx.x + threadIdx.x;
 
     // BLAKE3 init vectors.
-    const uint iv0 = 0x6a09e667ul;
-    const uint iv1 = 0xbb67ae85ul;
-    const uint iv2 = 0x3c6ef372ul;
-    const uint iv3 = 0xa54ff53aul;
-    // const uint iv4 = 0x510e527ful;
-    // const uint iv5 = 0x9b05688cul;
-    // const uint iv6 = 0x1f83d9abul;
-    // const uint iv7 = 0x5be0cd19ul;
+    const uint32_t iv0 = 0x6a09e667ul;
+    const uint32_t iv1 = 0xbb67ae85ul;
+    const uint32_t iv2 = 0x3c6ef372ul;
+    const uint32_t iv3 = 0xa54ff53aul;
+    // const uint32_t iv4 = 0x510e527ful;
+    // const uint32_t iv5 = 0x9b05688cul;
+    // const uint32_t iv6 = 0x1f83d9abul;
+    // const uint32_t iv7 = 0x5be0cd19ul;
 
     // Internal compression func state.
-    uint v0, v1, v2, v3, v4, v5, v6, v7;
-    uint v8, v9, v10, v11, v12, v13, v14, v15;
+    uint32_t v0, v1, v2, v3, v4, v5, v6, v7;
+    uint32_t v8, v9, v10, v11, v12, v13, v14, v15;
 
     // Do the initialization and first round together.
     // Round 1.
@@ -160,7 +160,7 @@ void search(
         return;
 
     // Update nonce.
-    uint pos = atomicInc(&output[0], 0xffffffff)+1;
+    uint32_t pos = atomicInc(&output[0], 0xffffffff)+1;
     if (pos > MAX_OUTPUT_RESULTS) return; // Bounds check output buffer.
     output[pos] = m3;
 }

From 83843adea288ef3aa4b07793db373f09a0b33770 Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Fri, 8 Sep 2023 11:49:07 -0300
Subject: [PATCH 133/150] cuda: Switch to cuda_builder.go instead of Makefile.

This switches the method for generating compiled versions of the
CUDA-enabled Blake3 kernel from the existing GNUmakefile to a
`go generate`-based generator.

This is necessary to properly support Windows builds, removes the direct
need for an external dependency (make) and makes gominer follow a more
idiomatic Go method for building its releases.
---
 .gitignore            |   5 +
 GNUmakefile           |  59 ---------
 cgo_flags.go          |  12 --
 cuda_builder.go       | 285 ++++++++++++++++++++++++++++++++++++++++++
 cudakernel_static.go  |  11 --
 cudakernel_windows.go |  14 ---
 cudevice.go           |   8 ++
 7 files changed, 298 insertions(+), 96 deletions(-)
 delete mode 100644 GNUmakefile
 delete mode 100644 cgo_flags.go
 create mode 100644 cuda_builder.go
 delete mode 100644 cudakernel_static.go
 delete mode 100644 cudakernel_windows.go

diff --git a/.gitignore b/.gitignore
index 6fa35c5..57afad4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 # Folders
 .vscode/
 _obj
+obj/
 _test
 
 # Architecture specific extensions/prefixes
@@ -17,6 +18,9 @@ _test
 _cgo_defun.c
 _cgo_gotypes.go
 _cgo_export.*
+*.dll
+*.exp
+*.lib
 
 _testmain.go
 
@@ -25,6 +29,7 @@ _testmain.go
 *.prof
 clean.sh
 kernel/
+nvidia/
 
 *~
 gominer
diff --git a/GNUmakefile b/GNUmakefile
deleted file mode 100644
index 5dbc8d2..0000000
--- a/GNUmakefile
+++ /dev/null
@@ -1,59 +0,0 @@
-CC ?= gcc -fPIC
-CXX ?= g++ -fPIC
-NVCC ?= nvcc -Xcompiler -fPIC
-AR ?= ar
-# -o is gnu only so this needs to be smarter; it does work because on darwin it
-#  fails which is also not windows.
-ARCH:=$(shell uname -o)
-
-.DEFAULT_GOAL := build
-
-ifeq ($(ARCH),Msys)
-nvidia:
-endif
-
-# Windows needs additional setup and since cgo does not support spaces in
-# in include and library paths we copy it to the correct location.
-#
-# Windows build assumes that CUDA V7.0 is installed in its default location.
-#
-# Windows gominer requires nvml.dll and blake3-decred.dll to reside in the same
-# directory as gominer.exe.
-ifeq ($(ARCH),Msys)
-obj: nvidia
-	mkdir nvidia
-	cp -r /c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/* nvidia
-	cp -r /c/Program\ Files/NVIDIA\ Corporation/NVSMI nvidia
-else
-obj:
-endif
-	mkdir obj
-
-ifeq ($(ARCH),Msys)
-obj/blake3-decred.dll: obj blake3.cu
-	$(NVCC) --shared --optimize=3 --compiler-options=-GS-,-MD -I. blake3.cu -o obj/blake3-decred.dll
-else
-obj/blake3.a: obj blake3.cu
-	$(NVCC) --lib --optimize=3 -I. blake3.cu -o obj/blake3.a
-endif
-
-ifeq ($(ARCH),Msys)
-build: obj/blake3-decred.dll
-else
-build: obj/blake3.a
-endif
-	go build -tags 'cuda'
-
-ifeq ($(ARCH),Msys)
-install: obj/blake3-decred.dll
-else
-install: obj/blake3.a
-endif
-	go install -tags 'cuda'
-
-clean:
-	rm -rf obj
-	go clean
-ifeq ($(ARCH),Msys)
-	rm -rf nvidia
-endif
diff --git a/cgo_flags.go b/cgo_flags.go
deleted file mode 100644
index 39dfda4..0000000
--- a/cgo_flags.go
+++ /dev/null
@@ -1,12 +0,0 @@
-// Copyright (c) 2016 The Decred developers.
-
-//go:build cuda && !opencl
-// +build cuda,!opencl
-
-package main
-
-/*
-#cgo !windows LDFLAGS: -L/opt/cuda/lib64 -L/opt/cuda/lib -lcuda -lcudart -lstdc++ obj/blake3.a
-#cgo windows LDFLAGS: -Lobj -lblake3-decred -Lnvidia/CUDA/v7.0/lib/x64 -lcuda -lcudart -Lnvidia/NVSMI -lnvml
-*/
-import "C"
diff --git a/cuda_builder.go b/cuda_builder.go
new file mode 100644
index 0000000..2296972
--- /dev/null
+++ b/cuda_builder.go
@@ -0,0 +1,285 @@
+// Copyright (c) 2023 The Decred developers.
+//
+// Builder file for the CUDA-based version of gominer.
+//
+// Unfortunately, most of the complexity of this generator comes from the
+// Windows build, along with limitations of the cgo tool.
+//
+// For non-Windows OSes, generating an appropriate library to include in the
+// resulting binary is usually just a matter of installing the required
+// packages from the distribution's package manager and then calling the Nvidia
+// CUDA compiler (nvcc) which should automatically be in the $PATH environment
+// variable.
+//
+// On Windows however, the situation is much more complicated: nvcc only works
+// with cl.exe, which is included in the Desktop Development with C++ component
+// of Microsoft's Visual Studio software. OTOH, cl.exe has little to no support
+// as a compiler for cgo, thus gominer is expected to be built with gcc inside
+// an MSYS2 environment when building on Windows. Additionally, neither cl.exe
+// nor the required Nvidia headers and libraries are included by default on
+// the system's or user's environment variables.
+//
+// Thus, in order to attempt to reduce the need for manual setup of the entire
+// building environment, this generator makes guesses and uses tricks to
+// ease building gominer, especially on Windows platforms. To generate
+// CUDA-enabled gominer binaries, run:
+//
+//   go generate -tags cuda .
+//
+// Instead of the usual way of using go run/go build.
+
+// The following build constraint enforces this file is only built as a
+// consequence of running `go generate -tags cuda .`.
+//go:build cudabuilder
+// +build cudabuilder
+
+package main
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+)
+
+var (
+	errNotOnMsys        = errors.New("not on MSYS2 MINGW64 terminal")
+	errCudaPathEnvUnset = errors.New("CUDA_PATH environment variable not set")
+	errNoCudartDll      = errors.New("cudart64_xx.dll not found")
+	errVswhereFailed    = errors.New("vswhere failed to find MSVC install")
+	errNoMsDevToolsDir  = errors.New("no C++ desktop compiler tools found")
+)
+
+// runCmd runs the command until it ends and returns an error. Stdout and Stderr
+// are redirected to the process's own, instead of discarded.
+func runCmd(name string, args ...string) error {
+	cmd := exec.Command(name, args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	return cmd.Run()
+}
+
+// runWithOutput runs the command until it ends and returns the output if no
+// errors are found.
+func runWithOutput(name string, args ...string) (string, error) {
+	cmd := exec.Command(name, args...)
+	out, err := cmd.Output()
+	if err != nil {
+		if out != nil {
+			fmt.Println(string(out))
+		}
+		return "", err
+	}
+	return string(out), nil
+}
+
+// winPathToShort converts a windows filepath into its short filepath
+// representation on a best effort basis. Works only on absolute paths.
+//
+// This is a limited version of the Windows API function GetShortPathName(),
+// implemented here to avoid having to create platform specific files for
+// cuda_builder.go.
+//
+// This is used as a trick to get over the fact that CGO_CFLAGS and CGO_LDFLAGS
+// do not support paths with spaces in them.
+func winPathToShort(path string) string {
+	if len(path) < 3 || path[1:3] != `:\` {
+		return path
+	}
+
+	parts := strings.Split(path, `\`)
+	parts[0] = parts[0] + `\`
+	for i, p := range parts {
+		if len(p) <= 8 {
+			continue
+		}
+
+		// List contents of the parent dir and find the index of the entry with
+		// the needed prefix. If any error occurs or the parent does not exist
+		// or is empty, assume the index is 1. Supports up to 2 digits.
+		prefix := p[:5]
+		index := 1
+		parent := filepath.Join(parts[:i]...)
+		pattern := filepath.Join(parent, prefix+"*")
+		entries, _ := filepath.Glob(pattern)
+		if len(entries) < 9 {
+			prefix = p[:6]
+		}
+		for j := range entries {
+			if filepath.Base(entries[j]) == p {
+				index = j + 1
+				break
+			}
+		}
+
+		parts[i] = fmt.Sprintf("%s~%d", prefix, index)
+	}
+
+	return filepath.Join(parts...)
+}
+
+// checkRequirementsWindows checks for the requirements to a Windows build and
+// sets the environment variables as necessary.
+func checkRequirementsWindows() error {
+	// Ensure the user is in an MSYS2 MINGW64 terminal, otherwise go build
+	// won't use the correct C compiler.
+	if os.Getenv("MSYSTEM_CHOST") != "x86_64-w64-mingw32" {
+		return errNotOnMsys
+	}
+
+	// Ensure CUDA Toolkit is installed and fetch its path to add it to
+	// PATH, CGO_CFLAGS and CGO_LDFLAGS.
+	cudaPath := os.Getenv("CUDA_PATH")
+	if cudaPath == "" {
+		return errCudaPathEnvUnset
+	}
+
+	// Copy cudart64_xx.dll to the current dir. The xx part depends on the
+	// installed version of the toolkit. This file is necessary for gominer
+	// to run and is not generally installed to the global system32 dir
+	// on Windows.
+	entries, err := filepath.Glob(cudaPath + `\bin\cudart64_*.dll`)
+	if err != nil {
+		return fmt.Errorf("%w due to %v", errNoCudartDll, err)
+	}
+	if len(entries) == 0 {
+		return errNoCudartDll
+	}
+	if err := runCmd("cp", entries[0], "."); err != nil {
+		return fmt.Errorf("unable to copy cudart64_xx.dll to current dir: %v", err)
+	}
+
+	// Ensure MSVC is installed and that there's an appropriate cl.exe. This
+	// attempts to find an install path for a 2017+ MSVC by querying
+	// vswhere.exe (which is installed in a well-known path), then reading
+	// the contents of a file that should have the default version of the
+	// desktop C++ compiler component, and then deriving the final path to
+	// the x64 version of such compiler.
+	//
+	// This was largely derived from the following vswhere guide:
+	// https://github.com/microsoft/vswhere/wiki/Find-VC
+	vswherePath := `C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe`
+	msvcRootPath, err := runWithOutput(vswherePath, "-latest", "-property", "installationPath")
+	if err != nil {
+		return fmt.Errorf("%w due to %v", errVswhereFailed, err)
+	} else if msvcRootPath = strings.TrimSpace(msvcRootPath); msvcRootPath == "" {
+		return errVswhereFailed
+	}
+	auxBuildPath := msvcRootPath + `\VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt`
+	vcppVersionBytes, err := os.ReadFile(auxBuildPath)
+	if err != nil {
+		return fmt.Errorf("%w due to %v", errNoMsDevToolsDir, err)
+	}
+	if vcppVersionBytes = bytes.TrimSpace(vcppVersionBytes); len(vcppVersionBytes) == 0 {
+		return fmt.Errorf("%w due to no version in Microsoft.VCToolsVersion.default.txt file",
+			errNoMsDevToolsDir)
+	}
+	clPath := msvcRootPath + `\VC\Tools\MSVC\` + string(vcppVersionBytes) + `\bin\Hostx64\x64`
+
+	// Populate the env vars that will be needed to build blake3.dll and
+	// gominer.exe.
+	cgoCflags := os.Getenv("CGO_CFLAGS")
+	cgoCflags = fmt.Sprintf("%s -I%s", cgoCflags, winPathToShort(cudaPath)+`\include`)
+	os.Setenv("CGO_CFLAGS", cgoCflags)
+
+	cgoLdflags := os.Getenv("CGO_LDFLAGS")
+	cgoLdflags = fmt.Sprintf("%s -L%s", cgoLdflags, winPathToShort(cudaPath)+`\lib\x64`)
+	os.Setenv("CGO_LDFLAGS", cgoLdflags)
+
+	path := os.Getenv("PATH")
+	path = fmt.Sprintf("%s;%s", cudaPath+`\bin`, path)
+	path = fmt.Sprintf("%s;%s", clPath, path)
+	os.Setenv("PATH", path)
+
+	return nil
+}
+
+// checkRequirementsDefault checks the requirements for any other OS/architecture
+// combinations.
+func checkRequirementsDefault() error {
+	return nil
+}
+
+// buildBlake3Windows builds the blake3.dll library with a compiled version of
+// the Blake3 kernel.
+func buildBlake3Windows() error {
+	return runCmd("nvcc", "--shared", "--optimize=3", "--compiler-options=-GS-,-MD",
+		"-I.", "blake3.cu", "-o", "blake3.dll")
+}
+
+// buildBlake3Default builds the blake3.a library with a compiled version of the
+// Blake3 kernel.
+func buildBlake3Default() error {
+	return runCmd("nvcc", "--lib", "--optimize=3",
+		"-I.", "blake3.cu", "-o", "obj/blake3.a")
+}
+
+// buildGominer builds the necessary platform-specific dependencies and then
+// builds the final gominer binary.
+func buildGominer() error {
+	// Determine the platform-specific functions.
+	checkRequirements := checkRequirementsDefault
+	buildBlake3 := buildBlake3Default
+	if runtime.GOOS == "windows" {
+		checkRequirements = checkRequirementsWindows
+		buildBlake3 = buildBlake3Windows
+	}
+
+	if err := checkRequirements(); err != nil {
+		return err
+	}
+	if err := buildBlake3(); err != nil {
+		return err
+	}
+
+	// Build final gominer binary.
+	if err := runCmd("go", "build", "-tags", "cuda", "."); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func main() {
+	err := buildGominer()
+	if err == nil {
+		return
+	}
+
+	p := func(format string, args ...interface{}) {
+		fmt.Fprintf(os.Stderr, format, args...)
+		fmt.Fprintf(os.Stderr, "\n")
+	}
+	p("Error generating blake3 CUDA library: %v", err.Error())
+
+	// Offer some advice to the user if we can.
+	switch {
+	case errors.Is(err, errNotOnMsys):
+		p("")
+		p("On Windows, this needs to be run from an MSYS2 MINGW64 terminal.")
+		p("Install MSYS2 (if not already installed), then look for the 'MSYS2 MINGW64' " +
+			"entry in the start menu to open the appropriate terminal, then try again")
+
+	case errors.Is(err, errNoCudartDll):
+		p("")
+		p("This means a suitable Nvidia CUDA Toolkit is not installed, an incompatible " +
+			"version is installed or the installation is corrupt.")
+
+	case errors.Is(err, errVswhereFailed):
+		p("")
+		p("This usually means Microsoft Visual Studio is not installed or a version " +
+			"older than MSVC 2017 is installed. Install a recent version of MSVC, available " +
+			"at https://visualstudio.microsoft.com/vs/community/")
+
+	case errors.Is(err, errNoMsDevToolsDir):
+		p("")
+		p("This usually means that the \"Desktop Development with C++\" component " +
+			"of MSVC is not installed. Re-run the MSVC installer and add this " +
+			"component, then try again.")
+	}
+	os.Exit(1)
+}
diff --git a/cudakernel_static.go b/cudakernel_static.go
deleted file mode 100644
index 30360db..0000000
--- a/cudakernel_static.go
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) 2016 The Decred developers.
-
-//go:build (linux && cuda) || (darwin && cuda)
-// +build linux,cuda darwin,cuda
-
-package main
-
-/*
-#include "decred.h"
-*/
-import "C"
diff --git a/cudakernel_windows.go b/cudakernel_windows.go
deleted file mode 100644
index 3e3b028..0000000
--- a/cudakernel_windows.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright (c) 2016 The Decred developers.
-//go:build cuda
-// +build cuda
-
-package main
-
-import (
-	"syscall"
-)
-
-var (
-	kernelDll      = syscall.MustLoadDLL("blake3-decred.dll")
-	kernelProcAddr = kernelDll.MustFindProc("decred_blake3_hash").Addr()
-)
diff --git a/cudevice.go b/cudevice.go
index 79b5868..ee37508 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -5,8 +5,16 @@
 
 package main
 
+// The following go:generate directive produces the appropriate intermediate
+// library with the Blake3 CUDA kernel for use with gominer as a result of
+// executing `go generate -tags cuda .`.
+//go:generate go run -tags "cudabuilder" cuda_builder.go
+
 /*
 #include "decred.h"
+
+#cgo !windows LDFLAGS: obj/blake3.a
+#cgo windows LDFLAGS: -L. -lblake3
 */
 import "C"
 

From 7e2fb9e40e5568d4c85636c7c7ef1474eb44cf6c Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Fri, 8 Sep 2023 13:31:42 -0300
Subject: [PATCH 134/150] README: Add CUDA building instructions.

---
 README.md                         | 171 ++++++++++++++++++++++++------
 docs/cuda-manual-windows-build.md | 155 +++++++++++++++++++++++++++
 2 files changed, 291 insertions(+), 35 deletions(-)
 create mode 100644 docs/cuda-manual-windows-build.md

diff --git a/README.md b/README.md
index c2bf2e7..a4f05e8 100644
--- a/README.md
+++ b/README.md
@@ -115,33 +115,44 @@ Gominer works with OpenCL (both AMD and NVIDIA) and CUDA (NVIDIA only).  At the
 current time, most users have reported that OpenCL gives them higher hashrates
 on NVIDIA.
 
-**NOTE: Although gominer works with CUDA, there are not any build instructions
-yet.  They will be provided at a later date**.
-
 Once you decide on OpenCL or CUDA, you will need to install the
 graphics driver for your GPU as well as the headers for OpenCL or CUDA
 depending on your choice.
 
 The exact packages are dependent on the specific Linux distribution, but,
-generally speaking, you will need the latest AMDGPU-PRO display drivers for
-AMD cards and the latest NVIDIA graphics display drivers for NVIDIA cards.
+generally speaking, you will need the latest AMDGPU-PRO display drivers for AMD
+cards and the latest NVIDIA graphics display drivers for NVIDIA cards.  Then,
+depending on whether you will build the OpenCL or CUDA version, the specific set
+of toolsets, headers and libraries will have to be installed.
 
-You will also need the OpenCL headers which is typically named something
-similar to `mesa-opencl-dev` (for AMD) or `nvidia-opencl-dev` (for NVIDIA).
+For OpenCL, the packages are typically named something similar to
+`mesa-opencl-dev` (for AMD) or `nvidia-opencl-dev` (for NVIDIA).
 
 If you're using OpenCL, it is also recommended to install your distribution's
 equivalent of the `clinfo` package if you have any issues to ensure your device
 can be detected by OpenCL.  When `clinfo` is unable to detect your device,
 `gominer` will not be able to either.
 
-The following sections provide instructions for these combinations:
+For CUDA, on distributions where it is available via the standard package
+manager, the required files are usually found as `nvidia-cuda-toolkit`.  NVIDIA
+also provides its own [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)
+downloads.
+
+The following sections provide instructions for building various combinations
+of `gominer`:
+
+* [NVIDIA on Ubuntu 23.04](#nvidia-on-ubuntu-2304)
+* [Debian Bookworm](#debian-bookworm)
 
-* [OpenCL for NVIDIA on Ubuntu 23.04](#opencl-with-nvidia-on-ubuntu-2304)
-* [OpenCL for AMD on Debian Bookworm](#opencl-with-amd-on-debian-bookworm)
+#### NVIDIA on Ubuntu 23.04
 
-#### OpenCL Build Instructions (Works with Both NVIDIA and AMD)
+This section provides instructions for building `gominer` on a computer with an
+NVIDIA graphics card running Ubuntu 23.04.  Both OpenCL and CUDA build
+instructions are provided.
 
-##### OpenCL with NVIDIA on Ubuntu 23.04
+##### Pre-requisites
+
+The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
 
 - Detect the model of your NVIDIA GPU and the recommended driver
   - `ubuntu-drivers devices`
@@ -150,12 +161,20 @@ The following sections provide instructions for these combinations:
     - `sudo ubuntu-drivers autoinstall`
   - **Alternatively, install a specific driver (for example)**
     - `sudo apt install nvidia-driver-525-server`
+- Install the basic development tools `git` and `go`
+  - `sudo apt install git golang`
 - Reboot to allow the graphics driver to load
   - `sudo reboot`
-- Install the OpenCL headers, `git` and `go`
-  - `sudo apt install nvidia-opencl-dev git golang`
 - Obtain the `gominer` source code
   - `git clone https://github.com/decred/gominer`
+- Jump to the appropriate section for either
+  [OpenCL](#opencl-on-ubuntu) or [CUDA](#cuda-on-ubuntu) 
+  depending on which GPU library you want to build `gominer` for.
+
+###### OpenCL on Ubuntu
+
+- Install the OpenCL headers
+  - `sudo apt install nvidia-opencl-dev`
 - Build `gominer`
   - `cd gominer`
   - `go build -tags opencl`
@@ -163,7 +182,24 @@ The following sections provide instructions for these combinations:
   - `./gominer -l`
 - You may now [configure and run](#configuring-gominer) `gominer`
 
-##### OpenCL with AMD on Debian Bookworm
+###### CUDA on Ubuntu
+
+- Install the NVIDIA CUDA Toolkit:
+  - `sudo apt install nvidia-cuda-toolkit`
+- Build `gominer`:
+  - `cd gominer`
+  - `go generate -tags cuda .`
+- Test `gominer` detects your GPU(s):
+  - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
+
+#### Debian Bookworm
+
+This section provides instructions for building `gominer` on a computer running
+Debian Bookworm.  Both OpenCL (using either AMD or NVIDIA graphics cards)
+and CUDA (NVIDIA graphics cards only) build instructions are provided.
+
+##### Pre-requisites
 
 - Enable the non-free (closed source) repository by using your favorite editor
   to modify `/etc/apt/sources.list` and appending `contrib non-free` to the
@@ -176,14 +212,32 @@ The following sections provide instructions for these combinations:
       ```
 - Update the Apt package manager with the new sources
   - `sudo apt update`
-- Install the AMD graphics driver and supporting firmware
+- Install the basic development tools `git` and `go`:
+  - `sudo apt install git golang`
+- Obtain the `gominer` source code
+  - `git clone https://github.com/decred/gominer`
+
+Proceed to install the appropriate graphics card driver and supporting firmware,
+based on the hardware available on the computer:
+
+- For AMD GPUs: Install the AMD graphics driver and supporting firmware
   - `sudo apt install firmware-linux firmware-linux-nonfree libdrm-amdgpu1 xserver-xorg-video-amdgpu`
-- Install the OpenCL headers, OpenCL Installable Client driver, OpenCL lib, `git` and `go`
-  - `sudo apt install opencl-headers mesa-opencl-icd ocl-icd-libopencl1 git golang`
+- For NVIDIA GPUs: Install the NVIDIA graphics driver:
+  - `sudo apt install nvidia-driver`
+- Restart the computer to ensure the driver is loaded
+- Jump to the appropriate section for either
+  [OpenCL](#opencl-on-debian) or [CUDA](#cuda-on-debian) 
+  depending on which GPU library you want to build `gominer` for.
+
+
+###### OpenCL on Debian
+
+This build mode supports both AMD and NVIDIA graphics cards.
+
+- Install the OpenCL headers, OpenCL Installable Client driver and OpenCL lib
+  - `sudo apt install opencl-headers mesa-opencl-icd ocl-icd-libopencl1`
 - Help the loader find the OpenCL library by creating a symbolic link to it:
   - `ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/libOpenCL.so`
-- Obtain the `gominer` source code
-  - `git clone https://github.com/decred/gominer`
 - Build `gominer`
   - `cd gominer`
   - `go build -tags opencl`
@@ -191,16 +245,26 @@ The following sections provide instructions for these combinations:
   - `./gominer -l`
 - You may now [configure and run](#configuring-gominer) `gominer`
 
-#### CUDA Build Instructions (NVIDIA only)
 
-**Build instructions are not available yet.  They will be provided at a later
-date**.
+###### CUDA on Debian
+
+Note that this requires having an NVIDIA graphics card installed on the
+computer.
+
+- Install the NVIDIA CUDA Toolkit:
+  - `sudo apt install nvidia-cuda-toolkit`
+- Build `gominer`:
+  - `cd gominer`
+  - `go generate -tags cuda .`
+- Test `gominer` detects your GPU(s):
+  - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
 
 ### Windows
 
-#### OpenCL Build Instructions (Works with Both NVIDIA and AMD)
+#### Windows Pre-requisites
 
-##### OpenCL Pre-Requisites
+The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
 
 - Download and install [MSYS2](https://www.msys2.org/)
   - Make sure you uncheck `Run MSYS2 now.`
@@ -209,15 +273,26 @@ date**.
     you didn't uncheck `Run MSYS2 now` as instructed.  That shell will not work,
     so close it if you forgot to uncheck it in the installer.
 - From within the `MSYS2 MINGW64` shell enter the following commands to install
-  `gcc`, `git`, `go`, `unzip`, the light OpenCL SDK, and the `gominer` source code
+  `gcc`, `git`, `go`, `unzip`:
   - `pacman -S mingw-w64-x86_64-gcc mingw-w64-x86_64-tools mingw-w64-x86_64-go git unzip`
-  - `wget https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/files/1406216/lightOCLSDK.zip`
-  - `unzip -d /c/appsdk lightOCLSDK.zip`
   - `git clone https://github.com/decred/gominer`
 - **Close the `MSYS2 MINGW64` shell and relaunch it**
   - NOTE: This is necessary to ensure all of the new environment variables are set properly
-- Jump to the appropriate section for either [NVIDIA](#opencl-with-nvidia) or
-  [AMD](#opencl-with-amd) depending on which type of GPU you have
+- Jump to the appropriate section for either
+  [OpenCL](#opencl-pre-requisites-on-windows), or [CUDA with NVIDIA](#cuda-with-nvidia) 
+  depending on which type of GPU you have
+
+##### OpenCL Pre-requisites on Windows
+
+The following is needed when performing an OpenCL build:
+
+- Still in the `MSYS2 MINGW64` shell enter the following commands to install 
+  the light OpenCL SDK:
+  - `wget https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/files/1406216/lightOCLSDK.zip`
+  - `unzip -d /c/appsdk lightOCLSDK.zip`
+- Jump to the appropriate section for either [OpenCL with NVIDIA](#opencl-with-nvidia),
+  or [OpenCL with AMD](#opencl-with-amd) depending on which type of GPU you have
+
 
 ##### OpenCL with NVIDIA
 
@@ -243,10 +318,36 @@ date**.
   - `./gominer -l`
 - You may now [configure and run](#configuring-gominer) `gominer`
 
-#### CUDA Build Instructions (NVIDIA only)
-
-**NOTE**: The CUDA version of the `gominer` is not yet compatible with
-Windows.
+#### CUDA with NVIDIA
+
+Building the CUDA-enabled `gominer` on a Windows platform is tricky, requires
+several GB worth of downloads and while we have made attempts at detecting the
+necessary tools and automating the building process, it is not guaranteed to
+work, in particular as newer or older versions of the various tools are
+installed.
+
+This guide has been tested on a Windows 10 machine, with an NVIDIA graphics card
+installed, using Microsoft Visual Studio Community Edition 2022 and NVIDIA CUDA
+Toolkit version 12.2.  If the automatic builder for `gominer` does not work on
+your system, you many need to [manually setup the various
+tools](/docs/cuda-manual-windows-build.md).
+
+After fulfilling the [Windows pre-requisites](#windows-pre-requisites), follow 
+the following instructions:
+
+- Download and install the appropriate NVIDIA driver
+  - https://www.nvidia.com/download/index.aspx
+- Download and install the NVIDIA CUDA Toolkit:
+  - https://developer.nvidia.com/cuda-toolkit
+- Download and install Microsoft Visual Studio:
+  - https://visualstudio.microsoft.com/vs/community/
+  - Ensure the "Desktop Development with C++" component will be installed
+- Build gominer:
+  - `go generate -tags cuda .`
+  - The warnings about deprecated symbols are safe to ignore
+- Test `gominer` detects your GPU(s):
+  - `./gominer -l`
+- You may now [configure and run](#configuring-gominer) `gominer`
 
 ## User Reported Hashrates
 
@@ -267,4 +368,4 @@ NVIDIA Tesla V100S     | 14.6 Gh/s
 NVIDIA RTX 4070        | 14.9 Gh/s
 NVIDIA RTX 3080        | 15.2 Gh/s
 NVIDIA RTX 3090        | 17.6 Gh/s
-AMD 7900 XTX           | 27.2 Gh/s
\ No newline at end of file
+AMD 7900 XTX           | 23.8 Gh/s
diff --git a/docs/cuda-manual-windows-build.md b/docs/cuda-manual-windows-build.md
new file mode 100644
index 0000000..a99df57
--- /dev/null
+++ b/docs/cuda-manual-windows-build.md
@@ -0,0 +1,155 @@
+# Manual CUDA Build on Windows
+
+Building the CUDA-enabled `gominer` on a Windows platform is tricky, requires
+several GB worth of downloads and the [automatic builder
+script](/cuda_builder.go) that is called when running `go generate -tags cuda`
+is not guaranteed to work on every combination of setups.
+
+The main difficulties when building the CUDA version on windows are:
+
+- The installers for CUDA Toolkit and MSVC do not put their respective binaries
+  in the user's `$PATH` environment variable.
+- A default installation of MSYS2 ignores the global and user's `$PATH` anyway.
+- Having files in `c:\Program Files\` (path with a space) causes all sorts of
+  issues in all layers of the compilation stack.  In particular, [cgo directives
+  do not work](https://github.com/golang/go/issues/45637) with such paths.
+- An [outdated CUDA package](https://github.com/barnex/cuda5) currently used by
+  `gominer` has wrong cgo include paths in any case.
+- CUDA requires `cl.exe` (MSVC compiler) to build its binary kernels, but Go
+  does not accept it as a possible `cgo` compiler.
+
+
+All of these must be resolved in order to correctly build the CUDA version of
+`gominer` on Windows.  The automatic builder script uses a few tricks and some
+reasonable assumptions to attempt to solve building for the most common install
+scenarios.  But for the cases where that is not sufficient, the following
+document lists a general procedure for setting the required Windows environment.
+
+## Install the Required Tools
+
+- Download and install the appropriate NVIDIA driver
+  - https://www.nvidia.com/download/index.aspx
+- Download and install the NVIDIA CUDA Toolkit:
+  - https://developer.nvidia.com/cuda-toolkit
+- Download and install Microsoft Visual Studio:
+  - https://visualstudio.microsoft.com/vs/community/
+  - Ensure the "Desktop Development with C++" component will be installed
+
+Restart the computer and ensure Windows can recognize the NVIDIA GPU as such.
+
+## Setup MSYS2
+
+While it _may_ be possible to build on PowerShell or even `cmd.exe`, going
+through MSYS2 means the changes are more explicit and generally don't require
+clicking through GUIs to find settings.
+
+- Download and install [MSYS2](https://www.msys2.org/)
+  - Make sure you uncheck `Run MSYS2 now.`
+- Launch the `MSYS2 MINGW64` shell from the start menu
+  - NOTE: The `MSYS2` installer will launch the `UCRT64` shell by default if
+    you didn't uncheck `Run MSYS2 now` as instructed.  That shell will not work,
+    so close it if you forgot to uncheck it in the installer.
+- From within the `MSYS2 MINGW64` shell enter the following commands to install
+  `gcc`, `git`, `go`, `unzip`:
+  - `pacman -S mingw-w64-x86_64-gcc mingw-w64-x86_64-tools mingw-w64-x86_64-go git unzip`
+  - `git clone https://github.com/decred/gominer`
+- **Close the `MSYS2 MINGW64` shell and relaunch it**
+  - NOTE: This is necessary to ensure all of the new environment variables are set properly
+
+
+## Create junction to NVIDIA Toolkit
+
+Now comes the tricky bit. `cgo` flags don't currently support paths with spaces
+in them. Therefore, we'll create a junction to the NVIDIA Toolkit install path
+in order to be able to specify it.
+
+- Determine the NVIDIA Toolkit install path.
+  - This is generally `c:\Program Files\NVIDIA GPU Toolkit\CUDA\vXX.Y`
+- Create a junction for the target directory from within MSYS2:
+  - `cd gominer`
+  - `cmd.exe //c mklink //J nvidia "c:\Program Files\NVIDIA GPU Toolkit\CUDA\vXX.Y"`
+  - Replace `vXX.Y` with the version of the toolkit actually installed
+- Copy the file `bin/cudart64_xx.dll` to `gominer`'s dir
+  - This file will be needed by `gominer` once built
+  - Replace XX with the major CUDA toolkit version
+  - `cp nvidia/bin/cudart64_*.dll .`
+
+If the toolkit is ever updated to a newer version, repeat this process. To
+remove the junction after it is no longer needed, execute `rm nvidia`
+
+
+## Determine the MSVC `cl.exe` location
+
+Visual Studio versions after 2017 support multiple installs on the same machine,
+each with different sets of components and features installed.  Determine which
+version contains the "Desktop Development with C++" component you wish to use
+and locate the path to its `cl.exe` compiler.
+
+For example, for Visual Studio 2022 Community edition, building x64 targets
+in x64 host computers (the most common case), the path to `cl.exe` is
+`C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.37.32822\bin\Hostx64\x64`.
+
+The important bit is that this dir *MUST* contain the `cl.exe` compiler. Modify
+the various versions as applicable to your specific computer.  Note down this
+dir, as it will be used in the next step.
+
+## Setup the Environment Variables
+
+Edit MSYS2's `.bash_profile` file to add the required environment variables.
+Any text editor may be used, as long as it supports Unix style line endings:
+
+- Edit `~/.bash_profile`
+  - If using a native Windows editor, the file is generally located at
+    `C:\msys64\home\[username]`
+- Add the following lines at the end of the file:
+
+```
+export CL_PATH="<path-to-cl.exe-found-in-previous-step>"
+export CGO_CFLAGS="$CGO_CFLAGS -Invidia/include"
+export CGO_LDFLAGS="$CGO_LDFLAGS -Lnvidia/lib"
+export PATH="$CL_PATH:$PATH"
+```
+
+Restart the MSYS2 shell after editing the file. To know whether the values are
+being used, issue `echo` commands (for example, `echo $CGO_CFLAGS`).
+
+## Build `blake3.dll` using `nvcc`
+
+Now with the environment fully setup, the Blake3 CUDA kernel may be built:
+
+```
+$ nvcc --shared --optimize=3 --compiler-options=-GS-,-MD -I. blake3.cu -o blake3.dll
+```
+
+The end result is that a `blake3.dll` shared library should be created in
+`gominer`'s root dir.
+
+Several warnings may be displayed (especially regarding use of deprecated calls)
+but those may be safely ignored.
+
+## Build `gominer.exe`
+
+Finally, with `blake3.dll` created, `gominer` may be built in the standard way:
+
+```
+$ go build -tags cuda .
+$ ./gominer -l
+```
+
+Continue with the standard [configure and run](../#configuring-gominer) procedure.
+
+### Troubleshooting `gominer.exe` does not run
+
+If after building `gominer.exe`, it exists in the dir but executing (with `-l`
+or `-h`) does not display anything in the terminal, this is usually a sign of
+missing dlls.
+
+Run the following command:
+
+```
+$ ldd gominer.exe
+```
+
+And search the output for any DLLs listed as "not found". Install any missing
+dependency or copy the DLL directly to `gominer`'s dir.
+

From a06968369d18f65c6f0e0124045959d5793a38b8 Mon Sep 17 00:00:00 2001
From: Dave Collins <dave@davec.name>
Date: Fri, 15 Sep 2023 11:18:22 -0500
Subject: [PATCH 135/150] README: Grammar nits and add windows prelim. (#211)

This addresses some grammar nits in the README, adds a Windows
preliminaries section to provide a little bit more guidance on choosing
between OpenCL and CUDA, and reorders the Windows sections so they are
consistent with the ordering throughout.
---
 README.md | 53 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index a4f05e8..595078b 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ The following sections provide instructions for building various combinations
 of `gominer`:
 
 * [NVIDIA on Ubuntu 23.04](#nvidia-on-ubuntu-2304)
-* [Debian Bookworm](#debian-bookworm)
+* [AMD and NVIDIA on Debian Bookworm](#debian-bookworm)
 
 #### NVIDIA on Ubuntu 23.04
 
@@ -150,7 +150,7 @@ This section provides instructions for building `gominer` on a computer with an
 NVIDIA graphics card running Ubuntu 23.04.  Both OpenCL and CUDA build
 instructions are provided.
 
-##### Pre-requisites
+##### Prerequisites
 
 The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
 
@@ -169,7 +169,7 @@ The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
   - `git clone https://github.com/decred/gominer`
 - Jump to the appropriate section for either
   [OpenCL](#opencl-on-ubuntu) or [CUDA](#cuda-on-ubuntu) 
-  depending on which GPU library you want to build `gominer` for.
+  depending on which GPU library you want to build `gominer` for
 
 ###### OpenCL on Ubuntu
 
@@ -196,10 +196,10 @@ The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
 #### Debian Bookworm
 
 This section provides instructions for building `gominer` on a computer running
-Debian bookworm.  Both OpenCL (using either AMD or an NVIDIA graphics cards)
+Debian bookworm.  Both OpenCL (using either AMD or NVIDIA graphics cards)
 and CUDA (NVIDIA graphics cards only) build instructions are provided.
 
-##### Pre-requisites
+##### Prerequisites
 
 - Enable the non-free (closed source) repository by using your favorite editor
   to modify `/etc/apt/sources.list` and appending `contrib non-free` to the
@@ -227,7 +227,7 @@ based on the hardware available on the computer:
 - Restart the computer to ensure the driver is loaded
 - Jump to the appropriate section for either
   [OpenCL](#opencl-on-debian) or [CUDA](#cuda-on-debian) 
-  depending on which GPU library you want to build `gominer` for.
+  depending on which GPU library you want to build `gominer` for
 
 
 ###### OpenCL on Debian
@@ -262,7 +262,17 @@ computer.
 
 ### Windows
 
-#### Windows Pre-requisites
+#### Windows Preliminaries
+
+Gominer works with OpenCL (both AMD and NVIDIA) and CUDA (NVIDIA only).
+
+At the current time, most users have reported that OpenCL gives them higher
+hashrates on NVIDIA.  Additionally, building the CUDA-enabled version of
+`gominer` on Windows is a much more involved process.  For these reasons, unless
+you really want to run the CUDA version for a specific reason, it is recommended
+to use OpenCL.
+
+#### Windows Prerequisites
 
 The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
 
@@ -279,10 +289,10 @@ The following steps are applicable for both OpenCL and CUDA builds of `gominer`:
 - **Close the `MSYS2 MINGW64` shell and relaunch it**
   - NOTE: This is necessary to ensure all of the new environment variables are set properly
 - Jump to the appropriate section for either
-  [OpenCL](#opencl-pre-requisites-on-windows), or [CUDA with NVIDIA](#cuda-with-nvidia) 
-  depending on which type of GPU you have
+  [OpenCL](#opencl-prerequisites-on-windows) or [CUDA](#cuda-with-nvidia)
+  depending on which GPU library you want to build `gominer` for
 
-##### OpenCL Pre-requisites on Windows
+##### OpenCL Prerequisites on Windows
 
 The following is needed when performing an OpenCL build:
 
@@ -290,12 +300,17 @@ The following is needed when performing an OpenCL build:
   the light OpenCL SDK:
   - `wget https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/files/1406216/lightOCLSDK.zip`
   - `unzip -d /c/appsdk lightOCLSDK.zip`
-- Jump to the appropriate section for either [OpenCL with NVIDIA](#opencl-with-nvidia),
-  or [OpenCL with AMD](#opencl-with-amd) depending on which type of GPU you have
+- Jump to the appropriate section for either [OpenCL with AMD](#opencl-with-amd) or
+  [OpenCL with NVIDIA](#opencl-with-nvidia) depending on which type of GPU you have
 
+##### OpenCL with AMD
 
-##### OpenCL with NVIDIA
-
+- Change to the library directory C:\appsdk\lib\x86_64
+  * `cd /c/appsdk/lib/x86_64`
+- Copy and prepare the AMD Display Library (ADL) for linking
+  - `cp /c/Windows/SysWOW64/atiadlxx.dll .`
+  - `gendef atiadlxx.dll`
+  - `dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def`
 - Build gominer
   - `cd ~/gominer`
   - `go build -tags opencl`
@@ -303,14 +318,8 @@ The following is needed when performing an OpenCL build:
   - `./gominer -l`
 - You may now [configure and run](#configuring-gominer) `gominer`
 
-##### OpenCL with AMD
+##### OpenCL with NVIDIA
 
-- Change to the library directory C:\appsdk\lib\x86_64
-  * `cd /c/appsdk/lib/x86_64`
-- Copy and prepare the AMD Display Library (ADL) for linking
-  - `cp /c/Windows/SysWOW64/atiadlxx.dll .`
-  - `gendef atiadlxx.dll`
-  - `dlltool --output-lib libatiadlxx.a --input-def atiadlxx.def`
 - Build gominer
   - `cd ~/gominer`
   - `go build -tags opencl`
@@ -332,7 +341,7 @@ Toolkit version 12.2.  If the automatic builder for `gominer` does not work on
 your system, you many need to [manually setup the various
 tools](/docs/cuda-manual-windows-build.md).
 
-After fulfilling the [Windows pre-requisites](#windows-pre-requisites), follow 
+After fulfilling the [Windows prerequisites](#windows-prerequisites), follow
 the following instructions:
 
 - Download and install the appropriate NVIDIA driver

From f6eaef347f561328c4b8b5ee1a2389121b74a69c Mon Sep 17 00:00:00 2001
From: jholdstock <jholdstock@decred.org>
Date: Sat, 16 Sep 2023 10:58:12 +0100
Subject: [PATCH 136/150] Fix build.

Code is not currently building because of a missing import and a misused
channel in cudevice.go.
---
 cudevice.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cudevice.go b/cudevice.go
index ee37508..acde3d1 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -19,6 +19,7 @@ package main
 import "C"
 
 import (
+	"context"
 	"fmt"
 	"math"
 	"math/bits"
@@ -368,7 +369,7 @@ func (d *Device) runDevice(ctx context.Context) error {
 		d.updateCurrentWork(ctx)
 
 		select {
-		case <-ctxDoneCh():
+		case <-ctxDoneCh:
 			return nil
 		default:
 		}

From 9f1c3f088802b51ca9b94d83d6265f0533df7ae7 Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Mon, 18 Sep 2023 08:14:08 -0300
Subject: [PATCH 137/150] cuda: Create obj dir in generator script.

This adds creation of the obj/ dir, which is needed to generate the
version of the Blake3 kernel used in CUDA builds.

The Windows version no longer needs the obj/ dir, therefore that is not
necessary on that platform.
---
 cuda_builder.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_builder.go b/cuda_builder.go
index 2296972..b910e81 100644
--- a/cuda_builder.go
+++ b/cuda_builder.go
@@ -201,7 +201,8 @@ func checkRequirementsWindows() error {
 // checkRequirementsDefault checks the requirements for any other OS/architecture
 // combinations.
 func checkRequirementsDefault() error {
-	return nil
+	// Create the obj/ dir.
+	return os.MkdirAll("obj", 0o755)
 }
 
 // buildBlake3Windows builds the blake3.dll library with a compiled version of

From 1c92e04b59ef55bdbf1447189f6d233e9148858a Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Mon, 18 Sep 2023 09:40:47 -0300
Subject: [PATCH 138/150] cuda: Support CUDA toolkit versions older than 10.

This commit adds a conditional macro to the Blake3 CUDA kernel so that
on older CUDA Toolkit versions a shift-based rotation is used instead of
the intrinsic.

According to the CUDA Toolkit documentation[1], the __funnelshift_rc
integer intrinsic was introduced on CUDA version 10. Prior to this
commit, the CUDA kernel of gominer could not be built when using
Toolkit versions older than 10.

Note that although this makes it possible to build the kernel on older
Toolkit versions, performance is significantly degraded when compared to
the modern, intrinsic-based versions.

[1]: https://docs.nvidia.com/cuda/archive/10.0/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html
---
 blake3.cu | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/blake3.cu b/blake3.cu
index 9cda430..5e43be3 100644
--- a/blake3.cu
+++ b/blake3.cu
@@ -15,7 +15,12 @@
 #define MAX_OUTPUT_RESULTS 32
 
 // Written and optimized by Dave Collins Sep 2023.
+#if __CUDACC_VER_MAJOR__ >= 10
 #define ROTR(v, n) __funnelshift_rc((v), (v), n)
+#else
+#define ROTR(v, n) ((v) >> n) | ((v) << (32 - n))
+#endif
+
 
 __global__
 void search(

From 724ef5b0e6c25410e029d23cd97e2fdd419da9ce Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Tue, 26 Sep 2023 10:25:24 -0300
Subject: [PATCH 139/150] cuda: Fix device initialization.

Previously, autocalibration could cause the device to still be in use
when the main runDevice() call is issued.

This commit fixes the bug by ensuring the CUDA device is reset
before/after autocalibration as well as on the main runDevice() call.
---
 cudevice.go | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/cudevice.go b/cudevice.go
index acde3d1..1383ab7 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -316,12 +316,15 @@ func (d *Device) runDevice(ctx context.Context) error {
 		return err
 	}
 
-	// Need to have this stuff here for a device vs thread issue.
+	// Setup the device settings.
 	runtime.LockOSThread()
-
 	cu.DeviceReset()
 	cu.SetDevice(d.cuDeviceID)
 	cu.SetDeviceFlags(cu.DeviceScheduleBlockingSync)
+	defer func() {
+		runtime.UnlockOSThread()
+		cu.DeviceReset()
+	}()
 
 	// kernel is built with nvcc, not an api call so must be done
 	// at compile time.
@@ -484,6 +487,17 @@ func (d *Device) getKernelExecutionTime(gridSize, threadCount uint32) (time.Dura
 // calcWorkSizeForMilliseconds calculates the correct worksize to achieve
 // a device execution cycle of the passed duration in milliseconds.
 func (d *Device) calcGridSizeForMilliseconds(ms int, threadCount uint32) (uint32, error) {
+
+	// Setup the device settings.
+	runtime.LockOSThread()
+	cu.DeviceReset()
+	cu.SetDevice(d.cuDeviceID)
+	cu.SetDeviceFlags(cu.DeviceScheduleBlockingSync)
+	defer func() {
+		runtime.UnlockOSThread()
+		cu.DeviceReset()
+	}()
+
 	gridSize := uint32(32)
 	timeToAchieve := time.Duration(ms) * time.Millisecond
 	for {

From 3df1bec576cbe180fe978b3f8c3acd425336bef7 Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Tue, 26 Sep 2023 13:00:34 -0300
Subject: [PATCH 140/150] cuda: Also use GPU arch for intrinsic determination.

Newer CUDA Toolkit versions can still target older GPU architectures for
kernel compilation. Therefore, the target architecture also needs to be
taken into account when determining whether to use the intrinsic or not
for the ROTR macro of the kernel.

This adds a check to only use the intrinsic for architectures greater
than compute_30.
---
 blake3.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blake3.cu b/blake3.cu
index 5e43be3..25f5ca2 100644
--- a/blake3.cu
+++ b/blake3.cu
@@ -15,7 +15,7 @@
 #define MAX_OUTPUT_RESULTS 32
 
 // Written and optimized by Dave Collins Sep 2023.
-#if __CUDACC_VER_MAJOR__ >= 10
+#if (__CUDACC_VER_MAJOR__ >= 10) && (__CUDA_ARCH__ > 300)
 #define ROTR(v, n) __funnelshift_rc((v), (v), n)
 #else
 #define ROTR(v, n) ((v) >> n) | ((v) << (32 - n))

From 9b7fd18c4f9ffdf82ab9c5914a616c2956c53f68 Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Tue, 26 Sep 2023 13:02:53 -0300
Subject: [PATCH 141/150] cuda: Automatically determine GPU architecture.

When compiling the CUDA kernel, it is important to specify the GPU
architecture to use in order to use the best possible GPU features,
namely the intrinsic used for the ROTR macro.

Different CUDA Toolkit versions support different architecture
selectors, therefore this commit adds automatic GPU architecture
determination based on the installed Toolkit version. Versions 11.5 and
newer use the 'all' selector for architecture, which is the broadest
available. Older versions default to using the 'compute_50'
architecture, which provides support for the required features.

The automatic selection can be overridden by specifying an environment
variable GOMINER_CUDA_GPU_ARCH when generating the kernel with go
generate which allows testing different configurations.
---
 cuda_builder.go | 65 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/cuda_builder.go b/cuda_builder.go
index b910e81..0a391ab 100644
--- a/cuda_builder.go
+++ b/cuda_builder.go
@@ -42,7 +42,9 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
+	"strconv"
 	"strings"
 )
 
@@ -205,17 +207,78 @@ func checkRequirementsDefault() error {
 	return os.MkdirAll("obj", 0o755)
 }
 
+// determineGPUArch determines the `--gpu-architecture` argument to use with
+// nvcc, based on the currently installed CUDA toolkit version.
+//
+// The GOMINER_CUDA_GPU_ARCH environment variable may be used to override
+// autodetection.
+func determineGPUArch() (string, error) {
+	if envVal := os.Getenv("GOMINER_CUDA_GPU_ARCH"); envVal != "" {
+		return envVal, nil
+	}
+
+	// defaultGPUArch is a sensible default architecture, which already
+	// includes the intrinsic used in the ROTR() macro of the CUDA kernel.
+	const defaultGPUArch = "compute_50"
+
+	// Run `nvcc --version` and base the decision on the toolkit version.
+	output, err := exec.Command("nvcc", "--version").Output()
+	if err != nil {
+		return "", fmt.Errorf("unable to run 'nvcc --version': %v", err)
+	}
+
+	re, err := regexp.Compile("(?mi:^Cuda compilation tools, release ([\\d]+)\\.([\\d]+))")
+	if err != nil {
+		return "", err
+	}
+	matches := re.FindStringSubmatch(string(output))
+	if len(matches) != 3 {
+		// nvcc --version failed to output the expected version string,
+		// so downgrade to the default arch.
+		return defaultGPUArch, nil
+	}
+
+	major, err := strconv.Atoi(matches[1])
+	if err != nil {
+		return "", err
+	}
+	minor, err := strconv.Atoi(matches[2])
+	if err != nil {
+		return "", err
+	}
+	if major > 11 || (major == 11 && minor >= 5) {
+		// Toolkit versions >= 11.5 have the "all" selector for
+		// --gpu-architecture, so use that as it's the broadest arch
+		// selector.
+		return "all", nil
+	}
+
+	// Otherwise, use the default.
+	return defaultGPUArch, nil
+}
+
 // buildBlake3Windows builds the blake3.dll library with a compiled version of
 // the Blake3 kernel.
 func buildBlake3Windows() error {
-	return runCmd("nvcc", "--shared", "--optimize=3", "--compiler-options=-GS-,-MD",
+	gpuArch, err := determineGPUArch()
+	if err != nil {
+		return err
+	}
+	return runCmd("nvcc", "--shared", "--optimize=3",
+		"--gpu-architecture="+gpuArch,
+		"--compiler-options=-GS-,-MD",
 		"-I.", "blake3.cu", "-o", "blake3.dll")
 }
 
 // buildBlake3Default builds the blake3.a library with a compiled version of the
 // Blake3 kernel.
 func buildBlake3Default() error {
+	gpuArch, err := determineGPUArch()
+	if err != nil {
+		return err
+	}
 	return runCmd("nvcc", "--lib", "--optimize=3",
+		"--gpu-architecture="+gpuArch,
 		"-I.", "blake3.cu", "-o", "obj/blake3.a")
 }
 

From 5bdfb8c475d961bdb67ee2fad5949a6dcafd6c6d Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Mon, 9 Oct 2023 07:13:47 -0500
Subject: [PATCH 142/150] Rework nonces to allow for stratum support.

Currently, both the first and second extra nonces are updated on new
work and iterations of the mining loop and the first extra nonce uses a
per-device byte to ensure unique work is being performed on each device.

However, proper stratum support requires using an extra nonce assigned
by the pool for the first nonce as well as respecting a provided length
for the second extra nonce.

This paves the way to be able to properly support those semantics by
reworking the nonce handling to avoid modifying the first extra nonce
during the mining loop, moving the per-device byte to the second extra
nonce, and rolling the second extra nonce during mining loop instead.
---
 cladldevice.go | 39 +++++++++++++++++++----------
 cldevice.go    | 39 +++++++++++++++++++----------
 cudevice.go    | 39 +++++++++++++++++++----------
 device.go      | 66 ++++++++++++++++++++++++++++++++------------------
 4 files changed, 121 insertions(+), 62 deletions(-)

diff --git a/cladldevice.go b/cladldevice.go
index 5fb8247..abbd319 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -111,13 +111,26 @@ type Device struct {
 
 	workSize uint32
 
-	// extraNonce is the device extraNonce, where the first
-	// byte is the device ID (supporting up to 255 devices)
-	// while the last 3 bytes is the extraNonce value. If
-	// the extraNonce goes through all 0x??FFFFFF values,
-	// it will reset to 0x??000000.
-	extraNonce    uint32
-	currentWorkID uint32
+	// extraNonce is an additional nonce that is used to separate groups of
+	// devices into exclusive ranges to ensure multiple groups do not duplicate
+	// work.
+	//
+	// For solo mining, it is unique per device.
+	//
+	// For pool mining, it is assigned by the pool on a per-connection basis and
+	// therefore is only unique per client.  Note that this means it will be the
+	// same for all devices with pool mining.
+	extraNonce uint32
+
+	// extraNonce2 is a per device additional nonce where the first byte is the
+	// device ID (offset by a per-process random value) and the last 3 bytes are
+	// dedicated to the search space.  Note that this means up to 256 devices
+	// are supported without the possibility of duplicate work.
+	//
+	// Since the first byte is unique per device, it does not change during
+	// operation which implies this value will rollover to 0x??000000 from
+	// 0x??ffffff.
+	extraNonce2 uint32
 
 	midstate  [8]uint32
 	lastBlock [16]uint32
@@ -480,9 +493,9 @@ func (d *Device) runDevice() error {
 		default:
 		}
 
-		// Increment extraNonce.
-		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = d.extraNonce
+		// Increment second extra nonce while respecting the device id.
+		util.RolloverExtraNonce(&d.extraNonce2)
+		d.lastBlock[work.Nonce2Word] = d.extraNonce2
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
@@ -557,15 +570,15 @@ func (d *Device) runDevice() error {
 
 		for i := uint32(0); i < outputData[0]; i++ {
 			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
-				"extraNonce %08x, workID %08x, timestamp %08x",
+				"extraNonce %08x, extraNonce2 %08x, timestamp %08x",
 				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
-				d.currentWorkID, d.lastBlock[work.TimestampWord])
+				d.lastBlock[work.Nonce2Word], d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
 			// difficulty 1 shares.
 			d.foundCandidate(d.lastBlock[work.TimestampWord], outputData[i+1],
-				d.lastBlock[work.Nonce1Word])
+				d.lastBlock[work.Nonce1Word], d.lastBlock[work.Nonce2Word])
 		}
 
 		elapsedTime := time.Since(currentTime)
diff --git a/cldevice.go b/cldevice.go
index 467b7ba..f546107 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -224,13 +224,26 @@ type Device struct {
 
 	workSize uint32
 
-	// extraNonce is the device extraNonce, where the first
-	// byte is the device ID (supporting up to 255 devices)
-	// while the last 3 bytes is the extraNonce value. If
-	// the extraNonce goes through all 0x??FFFFFF values,
-	// it will reset to 0x??000000.
-	extraNonce    uint32
-	currentWorkID uint32
+	// extraNonce is an additional nonce that is used to separate groups of
+	// devices into exclusive ranges to ensure multiple groups do not duplicate
+	// work.
+	//
+	// For solo mining, it is unique per device.
+	//
+	// For pool mining, it is assigned by the pool on a per-connection basis and
+	// therefore is only unique per client.  Note that this means it will be the
+	// same for all devices with pool mining.
+	extraNonce uint32
+
+	// extraNonce2 is a per device additional nonce where the first byte is the
+	// device ID (offset by a per-process random value) and the last 3 bytes are
+	// dedicated to the search space.  Note that this means up to 256 devices
+	// are supported without the possibility of duplicate work.
+	//
+	// Since the first byte is unique per device, it does not change during
+	// operation which implies this value will rollover to 0x??000000 from
+	// 0x??ffffff.
+	extraNonce2 uint32
 
 	midstate  [8]uint32
 	lastBlock [16]uint32
@@ -607,9 +620,9 @@ func (d *Device) runDevice(ctx context.Context) error {
 		default:
 		}
 
-		// Increment extraNonce.
-		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = d.extraNonce
+		// Increment second extra nonce while respecting the device id.
+		util.RolloverExtraNonce(&d.extraNonce2)
+		d.lastBlock[work.Nonce2Word] = d.extraNonce2
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
@@ -684,15 +697,15 @@ func (d *Device) runDevice(ctx context.Context) error {
 
 		for i := uint32(0); i < outputData[0]; i++ {
 			minrLog.Debugf("DEV #%d: Found candidate %v nonce %08x, "+
-				"extraNonce %08x, workID %08x, timestamp %08x",
+				"extraNonce %08x, extraNonce2 %08x, timestamp %08x",
 				d.index, i+1, outputData[i+1], d.lastBlock[work.Nonce1Word],
-				d.currentWorkID, d.lastBlock[work.TimestampWord])
+				d.lastBlock[work.Nonce2Word], d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
 			// difficulty 1 shares.
 			d.foundCandidate(d.lastBlock[work.TimestampWord], outputData[i+1],
-				d.lastBlock[work.Nonce1Word])
+				d.lastBlock[work.Nonce1Word], d.lastBlock[work.Nonce2Word])
 		}
 
 		elapsedTime := time.Since(currentTime)
diff --git a/cudevice.go b/cudevice.go
index 1383ab7..040a4a3 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -71,13 +71,26 @@ type Device struct {
 	cuThreadCount uint32
 	cuGridSize    uint32
 
-	// extraNonce is the device extraNonce, where the first
-	// byte is the device ID (supporting up to 255 devices)
-	// while the last 3 bytes is the extraNonce value. If
-	// the extraNonce goes through all 0x??FFFFFF values,
-	// it will reset to 0x??000000.
-	extraNonce    uint32
-	currentWorkID uint32
+	// extraNonce is an additional nonce that is used to separate groups of
+	// devices into exclusive ranges to ensure multiple groups do not duplicate
+	// work.
+	//
+	// For solo mining, it is unique per device.
+	//
+	// For pool mining, it is assigned by the pool on a per-connection basis and
+	// therefore is only unique per client.  Note that this means it will be the
+	// same for all devices with pool mining.
+	extraNonce uint32
+
+	// extraNonce2 is a per device additional nonce where the first byte is the
+	// device ID (offset by a per-process random value) and the last 3 bytes are
+	// dedicated to the search space.  Note that this means up to 256 devices
+	// are supported without the possibility of duplicate work.
+	//
+	// Since the first byte is unique per device, it does not change during
+	// operation which implies this value will rollover to 0x??000000 from
+	// 0x??ffffff.
+	extraNonce2 uint32
 
 	midstate  [8]uint32
 	lastBlock [16]uint32
@@ -377,9 +390,9 @@ func (d *Device) runDevice(ctx context.Context) error {
 		default:
 		}
 
-		// Increment extraNonce.
-		util.RolloverExtraNonce(&d.extraNonce)
-		d.lastBlock[work.Nonce1Word] = d.extraNonce
+		// Increment second extra nonce while respecting the device id.
+		util.RolloverExtraNonce(&d.extraNonce2)
+		d.lastBlock[work.Nonce2Word] = d.extraNonce2
 
 		// Update the timestamp. Only solo work allows you to roll
 		// the timestamp.
@@ -409,15 +422,15 @@ func (d *Device) runDevice(ctx context.Context) error {
 		numResults := nonceResultsHSlice[0]
 		for i, result := range nonceResultsHSlice[1 : 1+numResults] {
 			minrLog.Debugf("GPU #%d: Found candidate %v nonce %08x, "+
-				"extraNonce %08x, workID %08x, timestamp %08x",
+				"extraNonce %08x, extraNonce2 %08x, timestamp %08x",
 				d.index, i, result, d.lastBlock[work.Nonce1Word],
-				d.currentWorkID, d.lastBlock[work.TimestampWord])
+				d.lastBlock[work.Nonce2Word], d.lastBlock[work.TimestampWord])
 
 			// Assess the work. If it's below target, it'll be rejected
 			// here. The mining algorithm currently sends this function any
 			// difficulty 1 shares.
 			d.foundCandidate(d.lastBlock[work.TimestampWord], result,
-				d.lastBlock[work.Nonce1Word])
+				d.lastBlock[work.Nonce1Word], d.lastBlock[work.Nonce2Word])
 		}
 
 		elapsedTime := time.Since(currentTime)
diff --git a/device.go b/device.go
index 959662f..cb3f629 100644
--- a/device.go
+++ b/device.go
@@ -73,27 +73,31 @@ func (d *Device) initNonces() error {
 	extraNonceRandOffset := binary.LittleEndian.Uint32(buf[0:])
 	extraNonce2RandOffset := binary.LittleEndian.Uint32(buf[4:])
 
-	// Set the initial extra nonce as follows:
+	// Set the extra nonce to a random value.  This value is unique per device
+	// when solo mining.  The extra nonce is assigned by the pool instead when
+	// pool mining.
+	//
+	// When combined with the device ID in the second extra nonce below, this
+	// helps prevent collisions across multiple processes and systems working on
+	// the same template.
+	d.extraNonce = extraNonceRandOffset
+
+	// Set the initial second extra nonce as follows:
 	// - The first byte is the device ID offset by the first per-process random
 	//   device offset
 	// - The remaining 3 bytes are a per-device random extra nonce offset
 	//
-	// This, when coupled with the second per-process random device offset set
-	// elsewhere, ensures each device in the same system is doing different work
-	// (up to 65536 devices) while also helping prevent collisions across
-	// multiple processes and systems working on the same template.
-	deviceOffset := (uint32(d.index) + uint32(randDeviceOffset1)) % 255
-	d.extraNonce = deviceOffset<<24 | extraNonceRandOffset&0x00ffffff
-
-	// Set the current work ID to a random initial value.
+	// This ensures each device in the same system is doing different work (up
+	// to 256 devices).
 	//
-	// The current work ID is also treated as a secondary extra nonce and thus,
-	// when combined with the extra nonce above, the result is that the pair
-	// effectively acts as an 8-byte randomized extra nonce.
-	d.currentWorkID = extraNonce2RandOffset
-
-	minrLog.Debugf("DEV #%d: initial extraNonce %x, initial workID: %x",
-		d.index, d.extraNonce, d.currentWorkID)
+	// This implies that the total search space is 7 bytes when combining the 3
+	// bytes provided by this value along with the normal 4-byte nonce.  In
+	// other words, it supports devices up to ~72 Ph/s.
+	deviceOffset := (uint32(d.index) + uint32(randDeviceOffset1)) % 256
+	d.extraNonce2 = deviceOffset<<24 | extraNonce2RandOffset&0x00ffffff
+
+	minrLog.Debugf("DEV #%d: initial extraNonce %x, initial extraNonce2: %x",
+		d.index, d.extraNonce, d.extraNonce2)
 	return nil
 }
 
@@ -121,15 +125,30 @@ func (d *Device) updateCurrentWork(ctx context.Context) {
 	d.work = *w
 	minrLog.Tracef("pre-nonce: %x", d.work.Data[:])
 
-	// Bump and set the work ID.
-	d.currentWorkID++
+	// Ensure the work data is updated with the extra nonce associated with the
+	// device for solo mining.
+	//
+	// The extra nonce is provided by the pool when pool mining, so there is no
+	// need to update it in that case.
+	const en1Offset = 128 + 4*work.Nonce1Word
+	if d.work.IsGetWork {
+		binary.LittleEndian.PutUint32(d.work.Data[en1Offset:], d.extraNonce)
+	}
+
+	// Ensure the work data is updated with the second extra nonce associated
+	// with the device.
 	binary.LittleEndian.PutUint32(d.work.Data[128+4*work.Nonce2Word:],
-		d.currentWorkID)
+		d.extraNonce2)
 
 	// Set additional byte with the device id offset by a second per-process
-	// random device offset to support up to 65536 devices.
-	deviceID := uint8((uint32(d.index) + uint32(randDeviceOffset2)) % 255)
-	d.work.Data[128+4*work.Nonce3Word] = deviceID
+	// random device offset to support up to 65536 devices with getwork (solo)
+	// mining.  Pool mining does not support the additional byte, so it is not
+	// needed in that case.  Note that this also means pool mining only supports
+	// 256 devices per client (aka process instance).
+	if d.work.IsGetWork {
+		deviceID := uint8((uint32(d.index) + uint32(randDeviceOffset2)) % 256)
+		d.work.Data[128+4*work.Nonce3Word] = deviceID
+	}
 
 	// Hash the two first blocks.
 	d.midstate = blake3.Block(blake3.IV, d.work.Data[0:64], blake3.FlagChunkStart)
@@ -316,7 +335,7 @@ func (d *Device) fanControlSupported(kind string) bool {
 	return false
 }
 
-func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
+func (d *Device) foundCandidate(ts, nonce0, nonce1, nonce2 uint32) {
 	d.Lock()
 	defer d.Unlock()
 	// Construct the final block header.
@@ -326,6 +345,7 @@ func (d *Device) foundCandidate(ts, nonce0, nonce1 uint32) {
 	binary.LittleEndian.PutUint32(data[128+4*work.TimestampWord:], ts)
 	binary.LittleEndian.PutUint32(data[128+4*work.Nonce0Word:], nonce0)
 	binary.LittleEndian.PutUint32(data[128+4*work.Nonce1Word:], nonce1)
+	binary.LittleEndian.PutUint32(data[128+4*work.Nonce2Word:], nonce2)
 	hash := chainhash.Hash(blake3.FinalBlock(d.midstate, data[128:180]))
 
 	// Hashes that reach this logic and fail the minimal proof of

From 2e7c33a9f56a2602f3ae5682b72424cead395138 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Tue, 10 Oct 2023 13:44:18 -0500
Subject: [PATCH 143/150] Correct several stratum/pooled mining issues.

This updates the code to properly support pooled mining via stratum.

In particular:

- The extra nonce provided by the pool is now correctly set and used
- The length for the second extra nonce provided by the pool is now
  validated
- The second extra nonce length now respects the provided length
- The mining code now uses the correct offset within the serialized work
  data for the second extra nonce to ensure the pool properly
  reconstructs the header
- The provided timestamp is now updated locally as the mining process is
  underway and the final timestamp is submitted along with the share as
  expected
- The correct network parameters for the active network are now passed
  into the stratum code so the right difficulties are used
- The stratum fields that represent numbers are now consistently in
  little endian per the stratum "spec" (such that it is)
- The "second generation tx" field is now ignored because it does not
  apply to Decred

Finally, various logging messages have been cleaned up and the job id
is no longer incorrectly expected to be numeric when logging it.
---
 cladldevice.go     |  10 +-
 cldevice.go        |  10 +-
 cudevice.go        |  10 +-
 getwork.go         |  18 ++--
 miner.go           |   3 +-
 stratum/stratum.go | 241 +++++++++++++++++++--------------------------
 6 files changed, 125 insertions(+), 167 deletions(-)

diff --git a/cladldevice.go b/cladldevice.go
index abbd319..c842f7c 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -497,13 +497,9 @@ func (d *Device) runDevice() error {
 		util.RolloverExtraNonce(&d.extraNonce2)
 		d.lastBlock[work.Nonce2Word] = d.extraNonce2
 
-		// Update the timestamp. Only solo work allows you to roll
-		// the timestamp.
-		ts := d.work.JobTime
-		if d.work.IsGetWork {
-			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
-			ts = d.work.JobTime + diffSeconds
-		}
+		// Update the timestamp.
+		diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+		ts := d.work.JobTime + diffSeconds
 		d.lastBlock[work.TimestampWord] = ts
 
 		// arg 0: pointer to the buffer
diff --git a/cldevice.go b/cldevice.go
index f546107..3d0e5a2 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -624,13 +624,9 @@ func (d *Device) runDevice(ctx context.Context) error {
 		util.RolloverExtraNonce(&d.extraNonce2)
 		d.lastBlock[work.Nonce2Word] = d.extraNonce2
 
-		// Update the timestamp. Only solo work allows you to roll
-		// the timestamp.
-		ts := d.work.JobTime
-		if d.work.IsGetWork {
-			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
-			ts = d.work.JobTime + diffSeconds
-		}
+		// Update the timestamp.
+		diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+		ts := d.work.JobTime + diffSeconds
 		d.lastBlock[work.TimestampWord] = ts
 
 		// arg 0: pointer to the buffer
diff --git a/cudevice.go b/cudevice.go
index 040a4a3..0204020 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -394,13 +394,9 @@ func (d *Device) runDevice(ctx context.Context) error {
 		util.RolloverExtraNonce(&d.extraNonce2)
 		d.lastBlock[work.Nonce2Word] = d.extraNonce2
 
-		// Update the timestamp. Only solo work allows you to roll
-		// the timestamp.
-		ts := d.work.JobTime
-		if d.work.IsGetWork {
-			diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
-			ts = d.work.JobTime + diffSeconds
-		}
+		// Update the timestamp.
+		diffSeconds := uint32(time.Now().Unix()) - d.work.TimeReceived
+		ts := d.work.JobTime + diffSeconds
 		d.lastBlock[work.TimestampWord] = ts
 
 		// Clear the results buffer.
diff --git a/getwork.go b/getwork.go
index 52725e4..39f6990 100644
--- a/getwork.go
+++ b/getwork.go
@@ -15,7 +15,6 @@ import (
 	"net"
 	"net/http"
 	"os"
-	"strconv"
 	"time"
 
 	"github.com/decred/go-socks/socks"
@@ -165,14 +164,17 @@ func GetWork() (*work.Work, error) {
 			len(target))
 	}
 
-	bigTarget := new(big.Int)
-	bigTarget.SetBytes(util.Reverse(target))
+	// The bigTarget difficulty is provided in little endian, but big integers
+	// expect big endian, so reverse it accordingly.
+	bigTarget := new(big.Int).SetBytes(util.Reverse(target))
 
 	var workData [192]byte
 	copy(workData[:], data)
-	givenTs := binary.LittleEndian.Uint32(
-		workData[128+4*work.TimestampWord : 132+4*work.TimestampWord])
-	w := work.NewWork(workData, bigTarget, givenTs, uint32(time.Now().Unix()), true)
+
+	const isGetWork = true
+	timestamp := binary.LittleEndian.Uint32(workData[128+4*work.TimestampWord:])
+	w := work.NewWork(workData, bigTarget, timestamp, uint32(time.Now().Unix()),
+		isGetWork)
 
 	w.Target = bigTarget
 
@@ -196,8 +198,8 @@ func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 			return nil, err
 		}
 
-		intJob, _ := strconv.ParseInt(pool.PoolWork.JobID, 16, 0)
-		poolLog.Debugf("new job %v height %v", intJob, pool.PoolWork.Height)
+		poolLog.Debugf("new job %q height %v", pool.PoolWork.JobID,
+			pool.PoolWork.Height)
 
 		return pool.PoolWork.Work, nil
 	}
diff --git a/miner.go b/miner.go
index 0f55788..d190fc6 100644
--- a/miner.go
+++ b/miner.go
@@ -38,7 +38,8 @@ func NewMiner() (*Miner, error) {
 
 	// If needed, start pool code.
 	if cfg.Pool != "" && !cfg.Benchmark {
-		s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword, cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version())
+		s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword,
+			cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version(), chainParams)
 		if err != nil {
 			return nil, err
 		}
diff --git a/stratum/stratum.go b/stratum/stratum.go
index db38de9..eb71588 100644
--- a/stratum/stratum.go
+++ b/stratum/stratum.go
@@ -5,7 +5,6 @@ package stratum
 import (
 	"bufio"
 	"bytes"
-	"crypto/rand"
 	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
@@ -25,14 +24,13 @@ import (
 
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/chaincfg/v3"
+	"github.com/decred/dcrd/crypto/blake256"
 	"github.com/decred/dcrd/wire"
 	"github.com/decred/go-socks/socks"
 	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
-var chainParams = chaincfg.MainNetParams()
-
 // ErrStratumStaleWork indicates that the work to send to the pool was stale.
 var ErrStratumStaleWork = errors.New("stale work, throwing away")
 
@@ -42,7 +40,6 @@ type Stratum struct {
 	// The following variables must only be used atomically.
 	ValidShares   uint64
 	InvalidShares uint64
-	latestJobTime uint32
 
 	sync.Mutex
 	cfg       Config
@@ -61,6 +58,7 @@ type Stratum struct {
 
 // Config holdes the config options that may be used by a stratum pool.
 type Config struct {
+	Params    *chaincfg.Params
 	Pool      string
 	User      string
 	Pass      string
@@ -79,7 +77,6 @@ type NotifyWork struct {
 	ExtraNonce2Length float64
 	Nonce2            uint32
 	CB1               string
-	CB2               string
 	Height            int64
 	NtimeDelta        int64
 	JobID             string
@@ -174,8 +171,11 @@ func sliceRemove(s []uint64, e uint64) []uint64 {
 
 // StratumConn starts the initial connection to a stratum pool and sets defaults
 // in the pool object.
-func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string) (*Stratum, error) {
+func StratumConn(pool, user, pass, proxy, proxyUser, proxyPass, version string,
+	chainParams *chaincfg.Params) (*Stratum, error) {
+
 	var stratum Stratum
+	stratum.cfg.Params = chainParams
 	stratum.cfg.User = user
 	stratum.cfg.Pass = pass
 	stratum.cfg.Proxy = proxy
@@ -411,7 +411,6 @@ func (s *Stratum) handleNotifyRes(resp interface{}) {
 	}
 
 	s.PoolWork.Height = height
-	s.PoolWork.CB2 = nResp.GenTX2
 	s.PoolWork.Hash = nResp.Hash
 	s.PoolWork.Nbits = nResp.Nbits
 	s.PoolWork.Version = nResp.BlockVersion
@@ -699,13 +698,8 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 			return nil, errJsonType
 		}
 		nres.GenTX1 = genTX1
-		genTX2, ok := resi[3].(string)
-		if !ok {
-			return nil, errJsonType
-		}
-		nres.GenTX2 = genTX2
-		//ccminer code also confirms this
-		//nres.MerkleBranches = resi[4].([]string)
+		// Ignore the GenTX2 (param 3) and Merkle Branches (param 4) fields
+		// since they do not apply to Decred.
 		blockVersion, ok := resi[5].(string)
 		if !ok {
 			return nil, errJsonType
@@ -740,7 +734,7 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 		if !ok {
 			return nil, errJsonType
 		}
-		s.Target, err = util.DiffToTarget(difficulty, chainParams.PowLimit)
+		s.Target, err = util.DiffToTarget(difficulty, s.cfg.Params.PowLimit)
 		if err != nil {
 			return nil, err
 		}
@@ -833,108 +827,101 @@ func (s *Stratum) Unmarshal(blob []byte) (interface{}, error) {
 
 // PrepWork converts the stratum notify to getwork style data for mining.
 func (s *Stratum) PrepWork() error {
-	// Build final extranonce, which is basically the pool user and worker
-	// ID.
-	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
-	if err != nil {
-		log.Error("Error decoding ExtraNonce1.")
-		return err
-	}
-
-	// Work out padding.
-	tmp := []string{"%0", strconv.Itoa(int(s.PoolWork.ExtraNonce2Length) * 2), "x"}
-	fmtString := strings.Join(tmp, "")
-	en2, err := hex.DecodeString(fmt.Sprintf(fmtString, s.PoolWork.ExtraNonce2))
+	// Decode the previous block hash.  Stratum should provide the hash in
+	// internal byte order (meaning the exact order produced by the hash
+	// function).
+	//
+	// Note that this is reversed from how it typically appears when presented
+	// to humans in places such as block explorers which treat them as little
+	// endian uint256s.
+	prevHashBytes, err := hex.DecodeString(s.PoolWork.Hash)
 	if err != nil {
-		log.Error("Error decoding ExtraNonce2.")
-		return err
+		return fmt.Errorf("error decoding previous block hash: %w", err)
 	}
-	extraNonce := append(en1[:], en2[:]...)
 
-	// Put coinbase transaction together.
-	cb1, err := hex.DecodeString(s.PoolWork.CB1)
+	// Decode extranonce1 which is basically the pool user and worker ID.
+	const maxExtraNonce1Len = 4
+	en1, err := hex.DecodeString(s.PoolWork.ExtraNonce1)
 	if err != nil {
-		log.Error("Error decoding Coinbase pt 1.")
-		return err
+		return fmt.Errorf("error decoding ExtraNonce1 field: %w", err)
 	}
-	cb2, err := hex.DecodeString(s.PoolWork.CB2)
-	if err != nil {
-		log.Errorf("Error decoding Coinbase pt 2.")
-		return err
+	if len(en1) > maxExtraNonce1Len {
+		return fmt.Errorf("extraNonce1 length must be a max of %d bytes",
+			maxExtraNonce1Len)
 	}
 
-	// Generate current ntime.
-	ntime := time.Now().Unix() + s.PoolWork.NtimeDelta
-
-	log.Tracef("ntime: %x", ntime)
-
-	// Serialize header.
-	bh := wire.BlockHeader{}
-	v, err := hex.DecodeString(s.PoolWork.Version)
-	if err != nil {
-		return err
+	// Require a minimum of 4 bytes and a maximum of 12 bytes for the length of
+	// extraNonce2.
+	const requiredExtraNonce2Len = 4
+	const maxExtraNonce2Len = 12
+	if s.PoolWork.ExtraNonce2Length < requiredExtraNonce2Len {
+		return fmt.Errorf("extraNonce2 length must be at least %d bytes",
+			requiredExtraNonce2Len)
 	}
-	bh.Version = int32(binary.LittleEndian.Uint32(v))
-
-	nbits, err := hex.DecodeString(s.PoolWork.Nbits)
-	if err != nil {
-		log.Error("Error decoding nbits")
-		return err
+	if s.PoolWork.ExtraNonce2Length > maxExtraNonce2Len {
+		return fmt.Errorf("extraNonce2 length must be a max of %d bytes",
+			maxExtraNonce2Len)
 	}
 
-	b, _ := binary.Uvarint(nbits)
-	bh.Bits = uint32(b)
-	t := time.Now().Unix() + s.PoolWork.NtimeDelta
-	bh.Timestamp = time.Unix(t, 0)
-	bh.Nonce = 0
-
-	// Serialized version.
-	blockHeader, err := bh.Bytes()
-	if err != nil {
-		return err
-	}
-
-	data := blockHeader
-	copy(data[31:139], cb1[0:108])
-
-	var workdata [180]byte
-	workPosition := 0
-
-	version := new(bytes.Buffer)
-	err = binary.Write(version, binary.LittleEndian, v)
+	// The stratum "protocol" (which is not actually very well defined) does not
+	// individually provide all of the information the Decred header needs nor
+	// does it provide an official way to extend it.
+	//
+	// Further, the Decred header explicitly provides additional space which
+	// removes the need to create a new coinbase and update the merkle root.
+	//
+	// In order to address these things, the field that was intended to serve
+	// for the coinbase (generate transaction) is instead repurposed to contain
+	// the serialized partial header for everything after the previous block
+	// hash in the format it is to be hashed.
+	partialHeader, err := hex.DecodeString(s.PoolWork.CB1)
 	if err != nil {
-		return err
+		return fmt.Errorf("error decoding cb1 field (partial header): %w", err)
 	}
-	copy(workdata[workPosition:], version.Bytes())
 
-	prevHash := util.RevHash(s.PoolWork.Hash)
-	p, err := hex.DecodeString(prevHash)
+	// Decode block version.
+	//
+	// The block version must be in little endian in the serialized header and
+	// stratum should provide all fields in little endian.  Thus, no conversion
+	// is needed.
+	blockVerBytes, err := hex.DecodeString(s.PoolWork.Version)
 	if err != nil {
-		log.Error("Error encoding previous hash.")
-		return err
+		return fmt.Errorf("error decoding version field: %w", err)
 	}
 
-	workPosition += 4
-	copy(workdata[workPosition:], p)
-	workPosition += 32
-	copy(workdata[workPosition:], cb1[0:108])
-	workPosition += 108
-	copy(workdata[workPosition:], extraNonce)
-	workPosition = 176
-	copy(workdata[workPosition:], cb2)
-
-	var randomBytes = make([]byte, 4)
-	_, err = rand.Read(randomBytes)
+	// Decode timestamp.
+	//
+	// Stratum should provide all fields in little endian.
+	timestampBytes, err := hex.DecodeString(s.PoolWork.Ntime)
 	if err != nil {
-		log.Errorf("Unable to generate random bytes")
-		return err
+		return fmt.Errorf("error decoding timestamp field: %w", err)
 	}
+	timestamp := binary.LittleEndian.Uint32(timestampBytes)
 
+	// Assemble work with provided details.
+	//
+	// The getwork data format consists of the serialized block header followed
+	// by the additional blake3 padding needed to bring the data length to a
+	// multiple of the blake3 block size.  Since the blake3 block size is 64
+	// bytes and the header is 180 bytes, the next multiple is 192 bytes.
+	//
+	// See the comment on the partial header above for the rationale here.
+	//
+	// Note that the timestamp is not set in the work data here because it is
+	// passed along to the mining process separately where it is updated and set
+	// accordingly as work is performed.  It is also worth noting that the pool
+	// providing the work should typically have already set the provided
+	// timestamp in the provided partial header too, but this implementation
+	// does not rely on that assumption.
 	var workData [192]byte
-	copy(workData[:], workdata[:])
-	givenTs := binary.LittleEndian.Uint32(
-		workData[128+4*work.TimestampWord : 132+4*work.TimestampWord])
-	atomic.StoreUint32(&s.latestJobTime, givenTs)
+	offset := 0
+	offset += copy(workData[offset:], blockVerBytes)
+	offset += copy(workData[offset:], prevHashBytes)
+	copy(workData[offset:], partialHeader)
+
+	// Set the provided extra nonce at the expected offset.  The protocol
+	// expects it to be in the serialized header just after the 4-byte nonce.
+	copy(workData[144:], en1)
 
 	if s.Target == nil {
 		log.Errorf("No target set!  Reconnecting to pool.")
@@ -949,67 +936,47 @@ func (s *Stratum) PrepWork() error {
 		return nil
 	}
 
-	w := work.NewWork(workData, s.Target, givenTs, uint32(time.Now().Unix()), false)
-
-	log.Tracef("Stratum prepated work data %x, target %032x", w.Data[:],
-		w.Target.Bytes())
+	const isGetWork = false
+	receivedTime := uint32(time.Now().Unix())
+	w := work.NewWork(workData, s.Target, timestamp, receivedTime, isGetWork)
 	s.PoolWork.Work = w
+	log.Tracef("Stratum prepared work data %x, target %064x", w.Data[:],
+		w.Target)
 
 	return nil
 }
 
 // PrepSubmit formats a mining.sumbit message from the solved work.
 func (s *Stratum) PrepSubmit(data []byte) (Submit, error) {
-	log.Debugf("Stratum got valid work to submit %x", data)
-	log.Debugf("Stratum got valid work hash %v",
-		chainhash.HashH(data[0:180]))
-	data2 := make([]byte, 180)
-	copy(data2, data[0:180])
+	headerBytes := data[0:wire.MaxBlockHeaderPayload]
+	log.Debugf("Stratum got valid work to submit %x (block hash %v)", data,
+		chainhash.Hash(blake256.Sum256(headerBytes)))
 
 	sub := Submit{}
 	sub.Method = "mining.submit"
 
-	// Format data to send off.
-	hexData := hex.EncodeToString(data)
-	decodedData, err := hex.DecodeString(hexData)
-	if err != nil {
-		log.Error("Error decoding data")
-		return sub, err
-	}
-
 	var submittedHeader wire.BlockHeader
-	bhBuf := bytes.NewReader(decodedData[0:wire.MaxBlockHeaderPayload])
-	err = submittedHeader.Deserialize(bhBuf)
+	err := submittedHeader.Deserialize(bytes.NewReader(headerBytes))
 	if err != nil {
 		log.Error("Error generating header")
 		return sub, err
 	}
 
-	latestWorkTs := atomic.LoadUint32(&s.latestJobTime)
-	if uint32(submittedHeader.Timestamp.Unix()) != latestWorkTs {
-		return sub, ErrStratumStaleWork
-	}
-
 	s.ID++
 	sub.ID = s.ID
 	s.submitIDs = append(s.submitIDs, s.ID)
 
-	// The timestamp string should be:
-	//
-	//   timestampStr := fmt.Sprintf("%08x",
-	//     uint32(submittedHeader.Timestamp.Unix()))
-	//
-	// but the "stratum" protocol appears to only use this value
-	// to check if the miner is in sync with the latest announcement
-	// of work from the pool. If this value is anything other than
-	// the timestamp of the latest pool work timestamp, work gets
-	// rejected from the current implementation.
-	timestampStr := fmt.Sprintf("%08x", latestWorkTs)
-	nonceStr := fmt.Sprintf("%08x", submittedHeader.Nonce)
-	xnonceStr := hex.EncodeToString(data[144:156])
-
-	sub.Params = []string{s.cfg.User, s.PoolWork.JobID, xnonceStr, timestampStr,
-		nonceStr}
+	// The fields in the serialized header must be in little endian and stratum
+	// fields for numeric values should be in little endian.  Thus, no
+	// endianness conversion is needed.
+	uint32LEHex := func(startOffset int) string {
+		return hex.EncodeToString(data[startOffset : startOffset+4])
+	}
+	timestampHex := uint32LEHex(128 + 4*work.TimestampWord)
+	nonceHex := uint32LEHex(128 + 4*work.Nonce0Word)
+	extraNonce2Hex := uint32LEHex(128 + 4*work.Nonce2Word)
+	sub.Params = []string{s.cfg.User, s.PoolWork.JobID, extraNonce2Hex,
+		timestampHex, nonceHex}
 
 	return sub, nil
 }

From c520a0a9871195729b1e6468ef7695f8eb1640de Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Thu, 12 Oct 2023 13:36:35 -0400
Subject: [PATCH 144/150] pass required channel to newMinerDevs and return
 devices

---
 cladldevice.go | 17 +++++++++--------
 cldevice.go    | 17 +++++++++--------
 cudevice.go    | 15 ++++++++-------
 miner.go       |  5 +++--
 4 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/cladldevice.go b/cladldevice.go
index c842f7c..f28d123 100644
--- a/cladldevice.go
+++ b/cladldevice.go
@@ -583,20 +583,21 @@ func (d *Device) runDevice() error {
 	}
 }
 
-func newMinerDevs(m *Miner) (*Miner, int, error) {
+func newMinerDevs(workDone chan []byte) ([]*Device, error) {
 	deviceListIndex := 0
 	deviceListEnabledCount := 0
 
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return nil, 0, fmt.Errorf("could not get CL platforms: %w", err)
+		return nil, fmt.Errorf("could not get CL platforms: %w", err)
 	}
 
+	var devices []*Device
 	for p := range platformIDs {
 		platformID := platformIDs[p]
 		CLdeviceIDs, err := getCLDevices(platformID)
 		if err != nil {
-			return nil, 0, fmt.Errorf("could not get CL devices for platform: %w", err)
+			return nil, fmt.Errorf("could not get CL devices for platform: %w", err)
 		}
 
 		for _, CLdeviceID := range CLdeviceIDs {
@@ -613,17 +614,17 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 				miningAllowed = true
 			}
 			if miningAllowed {
-				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
-				deviceListEnabledCount++
-				m.devices = append(m.devices, newDevice)
+				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, workDone)
 				if err != nil {
-					return nil, 0, err
+					return nil, err
 				}
+				devices = append(devices, newDevice)
+				deviceListEnabledCount++
 			}
 			deviceListIndex++
 		}
 	}
-	return m, deviceListEnabledCount, nil
+	return devices, nil
 }
 
 func getDeviceInfo(id cl.CL_device_id,
diff --git a/cldevice.go b/cldevice.go
index 3d0e5a2..68b60e8 100644
--- a/cldevice.go
+++ b/cldevice.go
@@ -710,20 +710,21 @@ func (d *Device) runDevice(ctx context.Context) error {
 	}
 }
 
-func newMinerDevs(m *Miner) (*Miner, int, error) {
+func newMinerDevs(workDone chan []byte) ([]*Device, error) {
 	deviceListIndex := 0
 	deviceListEnabledCount := 0
 
 	platformIDs, err := getCLPlatforms()
 	if err != nil {
-		return nil, 0, fmt.Errorf("could not get CL platforms: %w", err)
+		return nil, fmt.Errorf("could not get CL platforms: %w", err)
 	}
 
+	var devices []*Device
 	for p := range platformIDs {
 		platformID := platformIDs[p]
 		CLdeviceIDs, err := getCLDevices(platformID)
 		if err != nil {
-			return nil, 0, fmt.Errorf("could not get CL devices for platform: %w", err)
+			return nil, fmt.Errorf("could not get CL devices for platform: %w", err)
 		}
 
 		for _, CLdeviceID := range CLdeviceIDs {
@@ -740,17 +741,17 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 				miningAllowed = true
 			}
 			if miningAllowed {
-				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, m.workDone)
-				deviceListEnabledCount++
-				m.devices = append(m.devices, newDevice)
+				newDevice, err := NewDevice(deviceListIndex, deviceListEnabledCount, platformID, CLdeviceID, workDone)
 				if err != nil {
-					return nil, 0, err
+					return nil, err
 				}
+				devices = append(devices, newDevice)
+				deviceListEnabledCount++
 			}
 			deviceListIndex++
 		}
 	}
-	return m, deviceListEnabledCount, nil
+	return devices, nil
 }
 
 func getDeviceInfo(id cl.CL_device_id,
diff --git a/cudevice.go b/cudevice.go
index 0204020..8964390 100644
--- a/cudevice.go
+++ b/cudevice.go
@@ -546,18 +546,19 @@ func (d *Device) calcGridSizeForMilliseconds(ms int, threadCount uint32) (uint32
 
 	return gridSize, nil
 }
-func newMinerDevs(m *Miner) (*Miner, int, error) {
+func newMinerDevs(workDone chan []byte) ([]*Device, error) {
 	deviceListIndex := 0
 	deviceListEnabledCount := 0
 
 	CUdeviceIDs, err := getInfo()
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}
 
 	// XXX Can probably combine these bits with the opencl ones once
 	// I decide what to do about the types.
 
+	var devices []*Device
 	for _, CUDeviceID := range CUdeviceIDs {
 		miningAllowed := false
 
@@ -573,17 +574,17 @@ func newMinerDevs(m *Miner) (*Miner, int, error) {
 		}
 
 		if miningAllowed {
-			newDevice, err := NewCuDevice(deviceListIndex, deviceListEnabledCount, CUDeviceID, m.workDone)
-			deviceListEnabledCount++
-			m.devices = append(m.devices, newDevice)
+			newDevice, err := NewCuDevice(deviceListIndex, deviceListEnabledCount, CUDeviceID, workDone)
 			if err != nil {
-				return nil, 0, err
+				return nil, err
 			}
+			devices = append(devices, newDevice)
+			deviceListEnabledCount++
 		}
 		deviceListIndex++
 	}
 
-	return m, deviceListEnabledCount, nil
+	return devices, nil
 }
 
 func (d *Device) Release() {
diff --git a/miner.go b/miner.go
index d190fc6..2c7aead 100644
--- a/miner.go
+++ b/miner.go
@@ -46,15 +46,16 @@ func NewMiner() (*Miner, error) {
 		m.pool = s
 	}
 
-	m, deviceListEnabledCount, err := newMinerDevs(m)
+	devices, err := newMinerDevs(m.workDone)
 	if err != nil {
 		return nil, err
 	}
 
-	if deviceListEnabledCount == 0 {
+	if len(devices) == 0 {
 		return nil, fmt.Errorf("no devices started")
 	}
 
+	m.devices = devices
 	m.started = uint32(time.Now().Unix())
 
 	return m, nil

From ec58d6b892b44a32718d96bd5dd3752cb0312c4a Mon Sep 17 00:00:00 2001
From: David Hill <dhill@mindcry.org>
Date: Tue, 10 Oct 2023 15:40:21 -0400
Subject: [PATCH 145/150] Use rpcclient w/ ntfns instead of polling.

---
 getwork.go | 230 -----------------------------------------------------
 go.mod     |   9 ++-
 go.sum     |  16 ++++
 main.go    |   7 +-
 miner.go   | 164 +++++++++++++++++++++++++++-----------
 5 files changed, 144 insertions(+), 282 deletions(-)

diff --git a/getwork.go b/getwork.go
index 39f6990..67678d3 100644
--- a/getwork.go
+++ b/getwork.go
@@ -3,184 +3,13 @@
 package main
 
 import (
-	"bytes"
-	"crypto/tls"
-	"crypto/x509"
-	"encoding/binary"
-	"encoding/hex"
 	"encoding/json"
 	"fmt"
-	"io"
-	"math/big"
-	"net"
-	"net/http"
-	"os"
-	"time"
 
-	"github.com/decred/go-socks/socks"
 	"github.com/decred/gominer/stratum"
-	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
-// newHTTPClient returns a new HTTP client that is configured according to the
-// proxy and TLS settings in the associated connection configuration.
-func newHTTPClient(cfg *config) (*http.Client, error) {
-	// Configure proxy if needed.
-	var dial func(network, addr string) (net.Conn, error)
-	if cfg.Proxy != "" {
-		proxy := &socks.Proxy{
-			Addr:     cfg.Proxy,
-			Username: cfg.ProxyUser,
-			Password: cfg.ProxyPass,
-		}
-		dial = func(network, addr string) (net.Conn, error) {
-			c, err := proxy.Dial(network, addr)
-			if err != nil {
-				return nil, err
-			}
-			return c, nil
-		}
-	}
-
-	// Configure TLS if needed.
-	var tlsConfig *tls.Config
-	if !cfg.NoTLS && cfg.RPCCert != "" {
-		pem, err := os.ReadFile(cfg.RPCCert)
-		if err != nil {
-			return nil, err
-		}
-
-		pool := x509.NewCertPool()
-		pool.AppendCertsFromPEM(pem)
-		tlsConfig = &tls.Config{
-			RootCAs:            pool,
-			InsecureSkipVerify: cfg.TLSSkipVerify,
-		}
-	}
-
-	// Create and return the new HTTP client potentially configured with a
-	// proxy and TLS.
-	client := http.Client{
-		Transport: &http.Transport{
-			Dial:            dial,
-			TLSClientConfig: tlsConfig,
-		},
-	}
-	return &client, nil
-}
-
-type getWorkResponseJson struct {
-	Result struct {
-		Data   string
-		Target string
-	}
-	Error *struct {
-		Code    int
-		Message string
-	}
-}
-
-type getWorkSubmitResponseJson struct {
-	Result bool
-	Error  *struct {
-		Code    int
-		Message string
-	}
-}
-
-const (
-	MaxIdleConnections int = 20
-	RequestTimeout     int = 5
-)
-
-// GetWork makes a getwork RPC call and returns the result (data and target).
-func GetWork() (*work.Work, error) {
-	// Generate a request to the configured RPC server.
-	protocol := "http"
-	if !cfg.NoTLS {
-		protocol = "https"
-	}
-	url := protocol + "://" + cfg.RPCServer
-	jsonStr := []byte(`{"jsonrpc": "2.0", "method": "getwork", "params": [], "id": 1}`)
-	bodyBuff := bytes.NewBuffer(jsonStr)
-	httpRequest, err := http.NewRequest("POST", url, bodyBuff)
-	if err != nil {
-		return nil, err
-	}
-	httpRequest.Close = true
-	httpRequest.Header.Set("Content-Type", "application/json")
-
-	// Configure basic access authorization.
-	httpRequest.SetBasicAuth(cfg.RPCUser, cfg.RPCPassword)
-
-	// Create the new HTTP client that is configured according to the user-
-	// specified options and submit the request.
-	httpClient, err := newHTTPClient(cfg)
-	if err != nil {
-		return nil, err
-	}
-	httpResponse, err := httpClient.Do(httpRequest)
-	if err != nil {
-		return nil, err
-	}
-
-	body, err := io.ReadAll(httpResponse.Body)
-	httpResponse.Body.Close()
-	if err != nil {
-		err = fmt.Errorf("error reading json reply: %w", err)
-		return nil, err
-	}
-
-	if httpResponse.Status != "200 OK" {
-		return nil, fmt.Errorf("http status %s: %s", httpResponse.Status, body)
-	}
-
-	var res getWorkResponseJson
-	err = json.Unmarshal(body, &res)
-	if err != nil {
-		return nil, err
-	}
-
-	if res.Error != nil {
-		return nil, fmt.Errorf("json error %d: %s", res.Error.Code,
-			res.Error.Message)
-	}
-
-	data, err := hex.DecodeString(res.Result.Data)
-	if err != nil {
-		return nil, err
-	}
-	if len(data) != 192 {
-		return nil, fmt.Errorf("wrong data length: got %d, expected 192",
-			len(data))
-	}
-	target, err := hex.DecodeString(res.Result.Target)
-	if err != nil {
-		return nil, err
-	}
-	if len(target) != 32 {
-		return nil, fmt.Errorf("wrong target length: got %d, expected 32",
-			len(target))
-	}
-
-	// The bigTarget difficulty is provided in little endian, but big integers
-	// expect big endian, so reverse it accordingly.
-	bigTarget := new(big.Int).SetBytes(util.Reverse(target))
-
-	var workData [192]byte
-	copy(workData[:], data)
-
-	const isGetWork = true
-	timestamp := binary.LittleEndian.Uint32(workData[128+4*work.TimestampWord:])
-	w := work.NewWork(workData, bigTarget, timestamp, uint32(time.Now().Unix()),
-		isGetWork)
-
-	w.Target = bigTarget
-
-	return w, nil
-}
-
 // GetPoolWork gets work from a stratum enabled pool.
 func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 	// Get Next work for stratum and mark it as used.
@@ -212,65 +41,6 @@ func GetPoolWork(pool *stratum.Stratum) (*work.Work, error) {
 	return nil, fmt.Errorf("no work available")
 }
 
-// GetWork makes a getwork RPC call and returns the result (data and target).
-func GetWorkSubmit(data []byte) (bool, error) {
-	// Generate a request to the configured RPC server.
-	protocol := "http"
-	if !cfg.NoTLS {
-		protocol = "https"
-	}
-	url := protocol + "://" + cfg.RPCServer
-	hexData := hex.EncodeToString(data)
-	jsonStr := []byte(`{"jsonrpc": "2.0", "method": "getwork", "params": ["` +
-		hexData + `"], "id": 1}`)
-	bodyBuff := bytes.NewBuffer(jsonStr)
-	httpRequest, err := http.NewRequest("POST", url, bodyBuff)
-	if err != nil {
-		return false, err
-	}
-	httpRequest.Close = true
-	httpRequest.Header.Set("Content-Type", "application/json")
-
-	// Configure basic access authorization.
-	httpRequest.SetBasicAuth(cfg.RPCUser, cfg.RPCPassword)
-
-	// Create the new HTTP client that is configured according to the user-
-	// specified options and submit the request.
-	httpClient, err := newHTTPClient(cfg)
-	if err != nil {
-		return false, err
-	}
-	httpResponse, err := httpClient.Do(httpRequest)
-	if err != nil {
-		return false, err
-	}
-
-	body, err := io.ReadAll(httpResponse.Body)
-	httpResponse.Body.Close()
-	if err != nil {
-		err = fmt.Errorf("error reading json reply: %w", err)
-		return false, err
-	}
-
-	if httpResponse.Status != "200 OK" {
-		return false, fmt.Errorf("error calling getwork (%s): %s",
-			httpResponse.Status, body)
-	}
-
-	var res getWorkSubmitResponseJson
-	err = json.Unmarshal(body, &res)
-	if err != nil {
-		return false, err
-	}
-
-	if res.Error != nil {
-		return false, fmt.Errorf("json error %d: %s", res.Error.Code,
-			res.Error.Message)
-	}
-
-	return res.Result, nil
-}
-
 // GetPoolWorkSubmit sends the result to the stratum enabled pool.
 func GetPoolWorkSubmit(data []byte, pool *stratum.Stratum) (bool, error) {
 	pool.Lock()
diff --git a/go.mod b/go.mod
index 4958cc8..2ccb26a 100644
--- a/go.mod
+++ b/go.mod
@@ -8,8 +8,8 @@ require (
 	github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0
 	github.com/decred/dcrd/chaincfg/chainhash v1.0.4
 	github.com/decred/dcrd/chaincfg/v3 v3.2.0
-	github.com/decred/dcrd/crypto/blake256 v1.0.1
 	github.com/decred/dcrd/dcrutil/v4 v4.0.1
+	github.com/decred/dcrd/rpcclient/v8 v8.0.0
 	github.com/decred/dcrd/wire v1.6.0
 	github.com/decred/go-socks v1.1.0
 	github.com/decred/slog v1.2.0
@@ -21,11 +21,18 @@ require (
 	github.com/agl/ed25519 v0.0.0-20170116200512-5312a6153412 // indirect
 	github.com/dchest/siphash v1.2.3 // indirect
 	github.com/decred/base58 v1.0.5 // indirect
+	github.com/decred/dcrd/blockchain/stake/v5 v5.0.0 // indirect
+	github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect
 	github.com/decred/dcrd/crypto/ripemd160 v1.0.2 // indirect
+	github.com/decred/dcrd/database/v3 v3.0.1 // indirect
 	github.com/decred/dcrd/dcrec v1.0.1 // indirect
 	github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3 // indirect
 	github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect
+	github.com/decred/dcrd/dcrjson/v4 v4.0.1 // indirect
+	github.com/decred/dcrd/gcs/v4 v4.0.0 // indirect
+	github.com/decred/dcrd/rpc/jsonrpc/types/v4 v4.0.0 // indirect
 	github.com/decred/dcrd/txscript/v4 v4.1.0 // indirect
+	github.com/gorilla/websocket v1.4.2 // indirect
 	github.com/klauspost/cpuid/v2 v2.0.9 // indirect
 	golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4 // indirect
 	lukechampine.com/blake3 v1.2.1 // indirect
diff --git a/go.sum b/go.sum
index 967371e..1fe8313 100644
--- a/go.sum
+++ b/go.sum
@@ -8,6 +8,8 @@ github.com/dchest/siphash v1.2.3 h1:QXwFc8cFOR2dSa/gE6o/HokBMWtLUaNDVd+22aKHeEA=
 github.com/dchest/siphash v1.2.3/go.mod h1:0NvQU092bT0ipiFN++/rXm69QG9tVxLAlQHIXMPAkHc=
 github.com/decred/base58 v1.0.5 h1:hwcieUM3pfPnE/6p3J100zoRfGkQxBulZHo7GZfOqic=
 github.com/decred/base58 v1.0.5/go.mod h1:s/8lukEHFA6bUQQb/v3rjUySJ2hu+RioCzLukAVkrfw=
+github.com/decred/dcrd/blockchain/stake/v5 v5.0.0 h1:WyxS8zMvTMpC5qYC9uJY+UzuV/x9ko4z20qBtH5Hzzs=
+github.com/decred/dcrd/blockchain/stake/v5 v5.0.0/go.mod h1:5sSjMq9THpnrLkW0SjEqIBIo8qq2nXzc+m7k9oFVVmY=
 github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0 h1:v3yfo66axjr3oLihct+5tLEeM9YUzvK3i/6e2Im6RO0=
 github.com/decred/dcrd/blockchain/standalone/v2 v2.2.0/go.mod h1:JsOpl2nHhW2D2bWMEtbMuAE+mIU/Pdd1i1pmYR+2RYI=
 github.com/decred/dcrd/chaincfg/chainhash v1.0.4 h1:zRCv6tdncLfLTKYqu7hrXvs7hW+8FO/NvwoFvGsrluU=
@@ -18,14 +20,24 @@ github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5il
 github.com/decred/dcrd/crypto/blake256 v1.0.1/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo=
 github.com/decred/dcrd/crypto/ripemd160 v1.0.2 h1:TvGTmUBHDU75OHro9ojPLK+Yv7gDl2hnUvRocRCjsys=
 github.com/decred/dcrd/crypto/ripemd160 v1.0.2/go.mod h1:uGfjDyePSpa75cSQLzNdVmWlbQMBuiJkvXw/MNKRY4M=
+github.com/decred/dcrd/database/v3 v3.0.1 h1:oaklASAsUBwDoRgaS961WYqecFMZNhI1k+BmGgeW7/U=
+github.com/decred/dcrd/database/v3 v3.0.1/go.mod h1:IErr/Z62pFLoPZTMPGxedbcIuseGk0w3dszP3AFbXyw=
 github.com/decred/dcrd/dcrec v1.0.1 h1:gDzlndw0zYxM5BlaV17d7ZJV6vhRe9njPBFeg4Db2UY=
 github.com/decred/dcrd/dcrec v1.0.1/go.mod h1:CO+EJd8eHFb8WHa84C7ZBkXsNUIywaTHb+UAuI5uo6o=
 github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3 h1:l/lhv2aJCUignzls81+wvga0TFlyoZx8QxRMQgXpZik=
 github.com/decred/dcrd/dcrec/edwards/v2 v2.0.3/go.mod h1:AKpV6+wZ2MfPRJnTbQ6NPgWrKzbe9RCIlCF/FKzMtM8=
 github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs=
 github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0=
+github.com/decred/dcrd/dcrjson/v4 v4.0.1 h1:vyQuB1miwGqbCVNm8P6br3V65WQ6wyrh0LycMkvaBBg=
+github.com/decred/dcrd/dcrjson/v4 v4.0.1/go.mod h1:2qVikafVF9/X3PngQVmqkbUbyAl32uik0k/kydgtqMc=
 github.com/decred/dcrd/dcrutil/v4 v4.0.1 h1:E+d2TNbpOj0f1L9RqkZkEm1QolFjajvkzxWC5WOPf1s=
 github.com/decred/dcrd/dcrutil/v4 v4.0.1/go.mod h1:7EXyHYj8FEqY+WzMuRkF0nh32ueLqhutZDoW4eQ+KRc=
+github.com/decred/dcrd/gcs/v4 v4.0.0 h1:bet+Ax1ZFUqn2M0g1uotm0b8F6BZ9MmblViyJ088E8k=
+github.com/decred/dcrd/gcs/v4 v4.0.0/go.mod h1:9z+EBagzpEdAumwS09vf/hiGaR8XhNmsBgaVq6u7/NI=
+github.com/decred/dcrd/rpc/jsonrpc/types/v4 v4.0.0 h1:4YUKsWKrKlkhVMYGRB6G0XI6QfwUnwEH18eoEbM1/+M=
+github.com/decred/dcrd/rpc/jsonrpc/types/v4 v4.0.0/go.mod h1:dDHO7ivrPAhZjFD3LoOJN/kdq5gi0sxie6zCsWHAiUo=
+github.com/decred/dcrd/rpcclient/v8 v8.0.0 h1:O4B5d+8e2OjbeFW+c1XcZNQzyp++04ArWhXgYrsURus=
+github.com/decred/dcrd/rpcclient/v8 v8.0.0/go.mod h1:gx4+DI5apuOEeLwPBJFlMoj3GFWq1I7/X8XCQmMTi8Q=
 github.com/decred/dcrd/txscript/v4 v4.1.0 h1:uEdcibIOl6BuWj3AqmXZ9xIK/qbo6lHY9aNk29FtkrU=
 github.com/decred/dcrd/txscript/v4 v4.1.0/go.mod h1:OVguPtPc4YMkgssxzP8B6XEMf/J3MB6S1JKpxgGQqi0=
 github.com/decred/dcrd/wire v1.6.0 h1:YOGwPHk4nzGr6OIwUGb8crJYWDiVLpuMxfDBCCF7s/o=
@@ -34,12 +46,16 @@ github.com/decred/go-socks v1.1.0 h1:dnENcc0KIqQo3HSXdgboXAHgqsCIutkqq6ntQjYtm2U
 github.com/decred/go-socks v1.1.0/go.mod h1:sDhHqkZH0X4JjSa02oYOGhcGHYp12FsY1jQ/meV8md0=
 github.com/decred/slog v1.2.0 h1:soHAxV52B54Di3WtKLfPum9OFfWqwtf/ygf9njdfnPM=
 github.com/decred/slog v1.2.0/go.mod h1:kVXlGnt6DHy2fV5OjSeuvCJ0OmlmTF6LFpEPMu/fOY0=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc=
+github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
 github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc=
 github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4=
 github.com/jrick/logrotate v1.0.0 h1:lQ1bL/n9mBNeIXoTUoYRlK4dHuNJVofX9oWqBtPnSzI=
 github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
 github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
 github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 h1:epCh84lMvA70Z7CTTCmYQn2CKbY8j86K7/FAIr141uY=
 golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4 h1:EZ2mChiOa8udjfp6rRmswTbtZN/QzUQp4ptM4rnjHvc=
 golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 lukechampine.com/blake3 v1.2.1 h1:YuqqRuaqsGV71BV/nm9xlI0MKUv4QC54jQnBChWbGnI=
diff --git a/main.go b/main.go
index bd22b3a..f2f6e12 100644
--- a/main.go
+++ b/main.go
@@ -75,18 +75,15 @@ func gominerMain() error {
 		}()
 	}
 
-	m, err := NewMiner()
+	ctx := shutdownListener()
+	m, err := NewMiner(ctx)
 	if err != nil {
 		mainLog.Criticalf("Error initializing miner: %v", err)
 		return err
 	}
-
 	if len(cfg.APIListeners) != 0 {
 		go RunMonitor(m)
 	}
-
-	ctx := shutdownListener()
-
 	m.Run(ctx)
 
 	return nil
diff --git a/miner.go b/miner.go
index 2c7aead..126ce0b 100644
--- a/miner.go
+++ b/miner.go
@@ -4,15 +4,21 @@ package main
 
 import (
 	"context"
+	"encoding/binary"
+	"encoding/hex"
 	"errors"
 	"fmt"
+	"math/big"
+	"os"
 	"sync"
 	"sync/atomic"
 	"time"
 
 	"github.com/decred/dcrd/chaincfg/chainhash"
 	"github.com/decred/dcrd/crypto/blake256"
+	"github.com/decred/dcrd/rpcclient/v8"
 	"github.com/decred/gominer/stratum"
+	"github.com/decred/gominer/util"
 	"github.com/decred/gominer/work"
 )
 
@@ -28,34 +34,114 @@ type Miner struct {
 	needsWorkRefresh chan struct{}
 	wg               sync.WaitGroup
 	pool             *stratum.Stratum
+
+	rpc *rpcclient.Client
 }
 
-func NewMiner() (*Miner, error) {
+func newStratum(devices []*Device) (*Miner, error) {
+	s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword, cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version(), chainParams)
+	if err != nil {
+		return nil, err
+	}
 	m := &Miner{
-		workDone:         make(chan []byte, 10),
+		devices:          devices,
+		pool:             s,
 		needsWorkRefresh: make(chan struct{}),
 	}
+	return m, nil
+}
 
-	// If needed, start pool code.
-	if cfg.Pool != "" && !cfg.Benchmark {
-		s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword,
-			cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version(), chainParams)
-		if err != nil {
-			return nil, err
-		}
-		m.pool = s
+func newSoloMiner(ctx context.Context, devices []*Device) (*Miner, error) {
+	var rpc *rpcclient.Client
+	ntfnHandlers := rpcclient.NotificationHandlers{
+		OnBlockConnected: func(blockHeader []byte, transactions [][]byte) {
+			minrLog.Infof("Block connected: %x (%d transactions)", blockHeader, len(transactions))
+		},
+		OnBlockDisconnected: func(blockHeader []byte) {
+			minrLog.Infof("Block disconnected: %x", blockHeader)
+		},
+		OnWork: func(data, target []byte, reason string) {
+			minrLog.Infof("Work received: %x %x %s", data, target, reason)
+
+			// The bigTarget difficulty is provided in little endian, but big integers
+			// expect big endian, so reverse it accordingly.
+			bigTarget := new(big.Int).SetBytes(util.Reverse(target))
+
+			var workData [192]byte
+			copy(workData[:], data)
+
+			const isGetWork = true
+			timestamp := binary.LittleEndian.Uint32(workData[128+4*work.TimestampWord:])
+			w := work.NewWork(workData, bigTarget, timestamp, uint32(time.Now().Unix()),
+				isGetWork)
+
+			// Solo
+			for _, d := range devices {
+				d.SetWork(ctx, w)
+			}
+		},
+	}
+	// Connect to local dcrd RPC server using websockets.
+	certs, err := os.ReadFile(cfg.RPCCert)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read rpc certificate %v: %w",
+			cfg.RPCCert, err)
 	}
 
-	devices, err := newMinerDevs(m.workDone)
+	connCfg := &rpcclient.ConnConfig{
+		Host:         cfg.RPCServer,
+		Endpoint:     "ws",
+		User:         cfg.RPCUser,
+		Pass:         cfg.RPCPassword,
+		Certificates: certs,
+		Proxy:        cfg.Proxy,
+		ProxyUser:    cfg.ProxyUser,
+		ProxyPass:    cfg.ProxyPass,
+	}
+	rpc, err = rpcclient.New(connCfg, &ntfnHandlers)
 	if err != nil {
 		return nil, err
 	}
+	err = rpc.NotifyWork(ctx)
+	if err != nil {
+		rpc.Shutdown()
+		return nil, err
+	}
+	err = rpc.NotifyBlocks(ctx)
+	if err != nil {
+		rpc.Shutdown()
+		return nil, err
+	}
+	m := &Miner{
+		devices: devices,
+		rpc:     rpc,
+	}
 
+	return m, nil
+}
+
+func NewMiner(ctx context.Context) (*Miner, error) {
+	workDone := make(chan []byte, 10)
+
+	devices, err := newMinerDevs(workDone)
+	if err != nil {
+		return nil, err
+	}
 	if len(devices) == 0 {
 		return nil, fmt.Errorf("no devices started")
 	}
 
-	m.devices = devices
+	var m *Miner
+	if cfg.Pool == "" {
+		m, err = newSoloMiner(ctx, devices)
+	} else {
+		m, err = newStratum(devices)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	m.workDone = workDone
 	m.started = uint32(time.Now().Unix())
 
 	return m, nil
@@ -71,24 +157,20 @@ func (m *Miner) workSubmitThread(ctx context.Context) {
 		case data := <-m.workDone:
 			// Only use that is we are not using a pool.
 			if m.pool == nil {
-				accepted, err := GetWorkSubmit(data)
+				// Solo
+				accepted, err := m.rpc.GetWorkSubmit(ctx, hex.EncodeToString(data))
 				if err != nil {
 					atomic.AddUint64(&m.invalidShares, 1)
-					minrLog.Errorf("Error submitting work: %v", err)
-				} else {
-					if accepted {
-						atomic.AddUint64(&m.validShares, 1)
-						minrLog.Infof("Submitted work successfully: block hash %v",
-							chainhash.Hash(blake256.Sum256(data[:180])))
-					} else {
-						atomic.AddUint64(&m.invalidShares, 1)
-					}
-
-					select {
-					case m.needsWorkRefresh <- struct{}{}:
-					case <-ctx.Done():
-					}
+					minrLog.Errorf("failed to submit work: %w", err)
+					continue
+				} else if !accepted {
+					atomic.AddUint64(&m.invalidShares, 1)
+					minrLog.Error("work not accepted")
+					continue
 				}
+				atomic.AddUint64(&m.validShares, 1)
+				minrLog.Infof("Submitted work successfully: block hash %v",
+					chainhash.Hash(blake256.Sum256(data[:180])))
 			} else {
 				submitted, err := GetPoolWorkSubmit(data, m.pool)
 				if err != nil {
@@ -124,32 +206,22 @@ func (m *Miner) workRefreshThread(ctx context.Context) {
 	defer t.Stop()
 
 	for {
-		// Only use that is we are not using a pool.
-		if m.pool == nil {
-			work, err := GetWork()
+		// Stratum only code.
+		m.pool.Lock()
+		if m.pool.PoolWork.NewWork {
+			work, err := GetPoolWork(m.pool)
+			m.pool.Unlock()
 			if err != nil {
-				minrLog.Errorf("Error in getwork: %v", err)
+				minrLog.Errorf("Error in getpoolwork: %v", err)
 			} else {
 				for _, d := range m.devices {
 					d.SetWork(ctx, work)
 				}
 			}
 		} else {
-			m.pool.Lock()
-			if m.pool.PoolWork.NewWork {
-				work, err := GetPoolWork(m.pool)
-				m.pool.Unlock()
-				if err != nil {
-					minrLog.Errorf("Error in getpoolwork: %v", err)
-				} else {
-					for _, d := range m.devices {
-						d.SetWork(ctx, work)
-					}
-				}
-			} else {
-				m.pool.Unlock()
-			}
+			m.pool.Unlock()
 		}
+
 		select {
 		case <-ctx.Done():
 			return
@@ -227,7 +299,7 @@ func (m *Miner) Run(ctx context.Context) {
 		for _, d := range m.devices {
 			d.SetWork(ctx, work)
 		}
-	} else {
+	} else if m.pool != nil {
 		m.wg.Add(1)
 		go m.workRefreshThread(ctx)
 	}

From c504f85d26230e4f19a6b1f1b2931e51271fc8b5 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Thu, 12 Oct 2023 13:43:55 -0500
Subject: [PATCH 146/150] Perform initial getwork when solo mining.

This ensures work is immediately available when solo mining by making an
initial call to getwork during the initial setup of the miner.

It entails refactoring the work preparation logic from the OnWork
handler into a separate function that is called with the result of the
initial getwork call as well as all future OnWork notifications.
---
 miner.go | 65 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 18 deletions(-)

diff --git a/miner.go b/miner.go
index 126ce0b..67cad83 100644
--- a/miner.go
+++ b/miner.go
@@ -51,6 +51,30 @@ func newStratum(devices []*Device) (*Miner, error) {
 	return m, nil
 }
 
+// onSoloWork prepares the provided getwork-based work data, which might have
+// either come from getwork directly or from asynchronous work notifications,
+// and updates all of the provided devices with that prepared work.
+func onSoloWork(ctx context.Context, data, target []byte, reason string, devices []*Device) {
+	minrLog.Debugf("Work received: (data: %x, target: %x, reason: %s)", data,
+		target, reason)
+
+	// The bigTarget difficulty is provided in little endian, but big integers
+	// expect big endian, so reverse it accordingly.
+	bigTarget := new(big.Int).SetBytes(util.Reverse(target))
+
+	var workData [192]byte
+	copy(workData[:], data)
+
+	const isGetWork = true
+	timestamp := binary.LittleEndian.Uint32(workData[128+4*work.TimestampWord:])
+	w := work.NewWork(workData, bigTarget, timestamp, uint32(time.Now().Unix()),
+		isGetWork)
+
+	for _, d := range devices {
+		d.SetWork(ctx, w)
+	}
+}
+
 func newSoloMiner(ctx context.Context, devices []*Device) (*Miner, error) {
 	var rpc *rpcclient.Client
 	ntfnHandlers := rpcclient.NotificationHandlers{
@@ -61,24 +85,7 @@ func newSoloMiner(ctx context.Context, devices []*Device) (*Miner, error) {
 			minrLog.Infof("Block disconnected: %x", blockHeader)
 		},
 		OnWork: func(data, target []byte, reason string) {
-			minrLog.Infof("Work received: %x %x %s", data, target, reason)
-
-			// The bigTarget difficulty is provided in little endian, but big integers
-			// expect big endian, so reverse it accordingly.
-			bigTarget := new(big.Int).SetBytes(util.Reverse(target))
-
-			var workData [192]byte
-			copy(workData[:], data)
-
-			const isGetWork = true
-			timestamp := binary.LittleEndian.Uint32(workData[128+4*work.TimestampWord:])
-			w := work.NewWork(workData, bigTarget, timestamp, uint32(time.Now().Unix()),
-				isGetWork)
-
-			// Solo
-			for _, d := range devices {
-				d.SetWork(ctx, w)
-			}
+			onSoloWork(ctx, data, target, reason, devices)
 		},
 	}
 	// Connect to local dcrd RPC server using websockets.
@@ -144,6 +151,28 @@ func NewMiner(ctx context.Context) (*Miner, error) {
 	m.workDone = workDone
 	m.started = uint32(time.Now().Unix())
 
+	// Perform an initial call to getwork when solo mining so work is available
+	// immediately.
+	if cfg.Pool == "" {
+		workResult, err := m.rpc.GetWork(ctx)
+		if err != nil {
+			m.rpc.Shutdown()
+			return nil, fmt.Errorf("unable to retrieve initial work: %w", err)
+		}
+
+		data, err := hex.DecodeString(workResult.Data)
+		if err != nil {
+			m.rpc.Shutdown()
+			return nil, fmt.Errorf("unable to decode work data: %w", err)
+		}
+		target, err := hex.DecodeString(workResult.Target)
+		if err != nil {
+			m.rpc.Shutdown()
+			return nil, fmt.Errorf("unable to decode work target: %w", err)
+		}
+		onSoloWork(ctx, data, target, "initialwork", devices)
+	}
+
 	return m, nil
 }
 

From 9bbf455ec350e48ff33773cea2151e887b320292 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Tue, 10 Oct 2023 18:34:15 -0500
Subject: [PATCH 147/150] version: Support single override and add git hash.

This reworks the way versions are handled internally to match other
Decred software.

In particular, it reverses the semantics such that the individual semver
components (major, minor, patch, prerelease, and buildmetadata) are
parsed from a full string and exported at init time.

Also, since the version is now parsed and verified to be accurate, it
updates the pre-release parsing to properly support dots as required by
the spec.

It adds the git commit hash to the version string as buildmetadata when
no buildmetadata is otherwise specified.

This provides a few main benefits:

- Allows a single linker override to fully specify the version string
  instead of having separate ones that can only override the prerelease
  and build metadata portions
- Provides run-time checks to ensure the full version string is valid
  per the semver spec regardless of whether it was specified directly in
  the source or provided via the linker
- The exact commit used to build non-release versions will be in the
  version string by default

Finally, while here, add some comments regarding the release process to
help maintainers.
---
 config.go  |   3 +-
 main.go    |   4 +-
 miner.go   |   3 +-
 version.go | 201 +++++++++++++++++++++++++++++++++++++++--------------
 4 files changed, 155 insertions(+), 56 deletions(-)

diff --git a/config.go b/config.go
index bba811f..fc4fcfc 100644
--- a/config.go
+++ b/config.go
@@ -318,7 +318,8 @@ func loadConfig() (*config, []string, error) {
 	}
 
 	if preCfg.ShowVersion {
-		fmt.Printf("%s %s version %s (Go version %s)\n", appName, gpuLib(), version(), runtime.Version())
+		fmt.Printf("%s %s version %s (Go version %s %s/%s)\n", appName, gpuLib(),
+			Version, runtime.Version(), runtime.GOOS, runtime.GOARCH)
 		os.Exit(0)
 	}
 
diff --git a/main.go b/main.go
index f2f6e12..924a750 100644
--- a/main.go
+++ b/main.go
@@ -28,8 +28,8 @@ func gominerMain() error {
 	}()
 
 	// Show version at startup.
-	mainLog.Infof("Version %s %s (Go version %s)",
-		version(), gpuLib(), runtime.Version())
+	mainLog.Infof("Version %s %s (Go version %s %s/%s)", Version, gpuLib(),
+		runtime.Version(), runtime.GOOS, runtime.GOARCH)
 
 	// Enable http profiling server if requested.
 	if cfg.Profile != "" {
diff --git a/miner.go b/miner.go
index 67cad83..495a094 100644
--- a/miner.go
+++ b/miner.go
@@ -39,7 +39,8 @@ type Miner struct {
 }
 
 func newStratum(devices []*Device) (*Miner, error) {
-	s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword, cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, version(), chainParams)
+	s, err := stratum.StratumConn(cfg.Pool, cfg.PoolUser, cfg.PoolPassword,
+		cfg.Proxy, cfg.ProxyUser, cfg.ProxyPass, Version, chainParams)
 	if err != nil {
 		return nil, err
 	}
diff --git a/version.go b/version.go
index ddca950..1fcf61c 100644
--- a/version.go
+++ b/version.go
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2013, 2014 The btcsuite developers
- * Copyright (c) 2015 The Decred developers
+ * Copyright (c) 2015-2023 The Decred developers
  * Copyright (c) 2016 Dario Nieuwenhuis
  *
  * Permission to use, copy, modify, and distribute this software for any
@@ -19,72 +19,169 @@
 package main
 
 import (
-	"bytes"
 	"fmt"
+	"regexp"
+	"runtime/debug"
+	"strconv"
 	"strings"
 )
 
-const semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-"
-
-// These constants define the application version and follow the semantic
-// versioning 2.0.0 spec (http://semver.org/).
 const (
-	appMajor uint = 1
-	appMinor uint = 0
-	appPatch uint = 0
+	// semanticAlphabet defines the allowed characters for the pre-release and
+	// build metadata portions of a semantic version string.
+	semanticAlphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-."
+)
+
+// semverRE is a regular expression used to parse a semantic version string into
+// its constituent parts.
+var semverRE = regexp.MustCompile(`^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)` +
+	`(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*` +
+	`[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$`)
+
+// These variables define the application version and follow the semantic
+// versioning 2.0.0 spec (https://semver.org/).
+var (
+	// Note for maintainers:
+	//
+	// The expected process for setting the version in releases is as follows:
+	// - Create a release branch of the form 'release-vMAJOR.MINOR'
+	// - Modify the Version variable below on that branch to:
+	//   - Remove the pre-release portion
+	//   - Set the build metadata to 'release.local'
+	// - Update the Version variable below on the master branch to the next
+	//   expected version while retaining a pre-release of 'pre'
+	//
+	// These steps ensure that building from source produces versions that are
+	// distinct from reproducible builds that override the Version via linker
+	// flags.
 
-	// appPreRelease MUST only contain characters from semanticAlphabet
-	// per the semantic versioning spec.
-	appPreRelease = "beta"
+	// Version is the application version per the semantic versioning 2.0.0 spec
+	// (https://semver.org/).
+	//
+	// It is defined as a variable so it can be overridden during the build
+	// process with:
+	// '-ldflags "-X main.Version=fullsemver"'
+	// if needed.
+	//
+	// It MUST be a full semantic version per the semantic versioning spec or
+	// the app will panic at runtime.  Of particular note is the pre-release
+	// and build metadata portions MUST only contain characters from
+	// semanticAlphabet.
+	Version = "2.0.0-pre"
+
+	// NOTE: The following values are set via init by parsing the above Version
+	// string.
+
+	// These fields are the individual semantic version components that define
+	// the application version.
+	Major         uint32
+	Minor         uint32
+	Patch         uint32
+	PreRelease    string
+	BuildMetadata string
 )
 
-// appBuild is defined as a variable so it can be overridden during the build
-// process with '-ldflags "-X main.appBuild=foo' if needed.  It MUST only
-// contain characters from semanticAlphabet per the semantic versioning spec.
-var appBuild = "dev"
-
-// version returns the application version as a properly formed string per the
-// semantic versioning 2.0.0 spec (http://semver.org/).
-func version() string {
-	// Start with the major, minor, and path versions.
-	version := fmt.Sprintf("%d.%d.%d", appMajor, appMinor, appPatch)
-
-	// Append pre-release version if there is one.  The hyphen called for
-	// by the semantic versioning spec is automatically appended and should
-	// not be contained in the pre-release string.  The pre-release version
-	// is not appended if it contains invalid characters.
-	preRelease := normalizeVerString(appPreRelease)
-	if preRelease != "" {
-		version = fmt.Sprintf("%s-%s", version, preRelease)
+// parseUint32 converts the passed string to an unsigned integer or returns an
+// error if it is invalid.
+func parseUint32(s string, fieldName string) (uint32, error) {
+	val, err := strconv.ParseUint(s, 10, 32)
+	if err != nil {
+		return 0, fmt.Errorf("malformed semver %s: %w", fieldName, err)
+	}
+	return uint32(val), err
+}
+
+// checkSemString returns an error if the passed string contains characters that
+// are not in the provided alphabet.
+func checkSemString(s, alphabet, fieldName string) error {
+	for _, r := range s {
+		if !strings.ContainsRune(alphabet, r) {
+			return fmt.Errorf("malformed semver %s: %q invalid", fieldName, r)
+		}
+	}
+	return nil
+}
+
+// parseSemVer parses various semver components from the provided string.
+func parseSemVer(s string) (uint32, uint32, uint32, string, string, error) {
+	// Parse the various semver components from the version string via a regular
+	// expression.
+	m := semverRE.FindStringSubmatch(s)
+	if m == nil {
+		err := fmt.Errorf("malformed version string %q: does not conform to "+
+			"semver specification", s)
+		return 0, 0, 0, "", "", err
+	}
+
+	major, err := parseUint32(m[1], "major")
+	if err != nil {
+		return 0, 0, 0, "", "", err
 	}
 
-	// Append build metadata if there is any.  The plus called for
-	// by the semantic versioning spec is automatically appended and should
-	// not be contained in the build metadata string.  The build metadata
-	// string is not appended if it contains invalid characters.
-	build := normalizeVerString(appBuild)
-	if build != "" {
-		version = fmt.Sprintf("%s+%s", version, build)
+	minor, err := parseUint32(m[2], "minor")
+	if err != nil {
+		return 0, 0, 0, "", "", err
 	}
 
-	return version
+	patch, err := parseUint32(m[3], "patch")
+	if err != nil {
+		return 0, 0, 0, "", "", err
+	}
+
+	preRel := m[4]
+	err = checkSemString(preRel, semanticAlphabet, "pre-release")
+	if err != nil {
+		return 0, 0, 0, "", "", err
+	}
+
+	build := m[5]
+	err = checkSemString(build, semanticAlphabet, "buildmetadata")
+	if err != nil {
+		return 0, 0, 0, "", "", err
+	}
+
+	return major, minor, patch, preRel, build, nil
+}
+
+// vcsCommitID attempts to return the version control system short commit hash
+// that was used to build the binary.  It currently only detects git commits.
+func vcsCommitID() string {
+	bi, ok := debug.ReadBuildInfo()
+	if !ok {
+		return ""
+	}
+	var vcs, revision string
+	for _, bs := range bi.Settings {
+		switch bs.Key {
+		case "vcs":
+			vcs = bs.Value
+		case "vcs.revision":
+			revision = bs.Value
+		}
+	}
+	if vcs == "" {
+		return ""
+	}
+	if vcs == "git" && len(revision) > 9 {
+		revision = revision[:9]
+	}
+	return revision
 }
 
-// normalizeVerString returns the passed string stripped of all characters which
-// are not valid according to the semantic versioning guidelines for pre-release
-// version and build metadata strings.  In particular they MUST only contain
-// characters in semanticAlphabet.
-func normalizeVerString(str string) string {
-	result := bytes.Buffer{}
-	for _, r := range str {
-		if strings.ContainsRune(semanticAlphabet, r) {
-			_, err := result.WriteRune(r)
-			// Writing to a bytes.Buffer panics on OOM, and all
-			// errors are unexpected.
-			if err != nil {
-				panic(err)
+func init() {
+	var err error
+	Major, Minor, Patch, PreRelease, BuildMetadata, err = parseSemVer(Version)
+	if err != nil {
+		panic(err)
+	}
+	if BuildMetadata == "" {
+		BuildMetadata = vcsCommitID()
+		if BuildMetadata != "" {
+			Version = fmt.Sprintf("%d.%d.%d", Major, Minor, Patch)
+			if PreRelease != "" {
+				Version += "-" + PreRelease
 			}
+			Version += "+" + BuildMetadata
 		}
 	}
-	return result.String()
 }

From 2d62142be81d43f147bc1254946f90e5110de80b Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Thu, 12 Oct 2023 23:43:48 -0500
Subject: [PATCH 148/150] README: Update for pool support.

This updates the README to call out support for pool mining and adds
explicit instructions for dcrpool as well as a general pool mining
section.
---
 README.md | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 595078b..4498076 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,8 @@
 gominer is an application for performing Proof-of-Work (PoW) mining on the
 Decred network after the activation of
 [DCP0011](https://github.com/decred/dcps/blob/master/dcp-0011/dcp-0011.mediawiki)
-using BLAKE3.  It supports solo mining using OpenCL and CUDA devices.
+using BLAKE3.  It supports solo and stratum/pool mining using OpenCL and CUDA
+devices.
 
 [User Reported Hashrates](#user-reported-hashrates)
 
@@ -50,7 +51,7 @@ aforementioned `dcrd.conf` file and restarting `dcrd`.
 your setup.
 
 ```
-./gominer -B
+gominer -B
 ```
 
 ### Solo Mining on Mainnet
@@ -60,7 +61,35 @@ credentials as well as `dcrd` with a `miningaddr`.  Once the credentials and
 mining address have been configured, simply run gominer to begin mining.
 
 ```
-./gominer
+gominer
+```
+
+### Stratum/pool Mining on Mainnet
+
+#### Mining with a Pool Based on Dcrpool
+
+The username for pools running [dcrpool](https://github.com/decred/dcrpool) is
+the payment address for receiving rewards and a unique name identifying the
+client formatted as `address.name`.
+
+Run the following command replacing the `pooldomain:port` with the appropriate
+domain name and port of the desired pool to connect to and the `address.name`
+as previously described:
+
+```
+gominer --pool stratum+tcp://pooldomain:port --pooluser address.name
+```
+
+#### General Pool Mining
+
+There is no other known pool software aside from
+[dcrpool](https://github.com/decred/dcrpool) that supports the latest Decred
+consensus rules at the current time.  However, as long as the pool software
+supports the stratum protocol with the same semantics implemented by `dcrpool`,
+the following command should serve as a starting point:
+
+```
+gominer --pool stratum+tcp://pooldomain:port --pooluser username --poolpass password
 ```
 
 ## Status API

From b01524dd1f7415d8007a3ed30a53a1c908a04e14 Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Sat, 14 Oct 2023 00:39:16 -0500
Subject: [PATCH 149/150] release: Bump for 2.1 release cycle.

---
 version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.go b/version.go
index 1fcf61c..69c0075 100644
--- a/version.go
+++ b/version.go
@@ -67,7 +67,7 @@ var (
 	// the app will panic at runtime.  Of particular note is the pre-release
 	// and build metadata portions MUST only contain characters from
 	// semanticAlphabet.
-	Version = "2.0.0-pre"
+	Version = "2.1.0-pre"
 
 	// NOTE: The following values are set via init by parsing the above Version
 	// string.

From 68791b06465250db861e8d7ce9c6b5b2ccdb074b Mon Sep 17 00:00:00 2001
From: Matheus Degiovani <opensource@matheusd.com>
Date: Sat, 20 Jan 2024 09:57:12 -0300
Subject: [PATCH 150/150] miner: Remove need for dcrd in benchmark mode.

This makes it possible to run gominer in benchmark mode (-B) without
having set up a dcrd instance.  This is particularly useful when
commissioning new machines or testing the software, so that users do not
have to set up an entire simnet environment or wait for a full mainnet
dcrd and dcrwallet sync.
---
 miner.go | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/miner.go b/miner.go
index 495a094..58fdce3 100644
--- a/miner.go
+++ b/miner.go
@@ -128,6 +128,10 @@ func newSoloMiner(ctx context.Context, devices []*Device) (*Miner, error) {
 	return m, nil
 }
 
+func newBenchmarkMiner(devices []*Device) *Miner {
+	return &Miner{devices: devices}
+}
+
 func NewMiner(ctx context.Context) (*Miner, error) {
 	workDone := make(chan []byte, 10)
 
@@ -140,9 +144,12 @@ func NewMiner(ctx context.Context) (*Miner, error) {
 	}
 
 	var m *Miner
-	if cfg.Pool == "" {
+	switch {
+	case cfg.Benchmark:
+		m = newBenchmarkMiner(devices)
+	case cfg.Pool == "":
 		m, err = newSoloMiner(ctx, devices)
-	} else {
+	default:
 		m, err = newStratum(devices)
 	}
 	if err != nil {
@@ -152,6 +159,12 @@ func NewMiner(ctx context.Context) (*Miner, error) {
 	m.workDone = workDone
 	m.started = uint32(time.Now().Unix())
 
+	// Return early on benchmark mode to avoid requiring a dcrd instance to
+	// be running.
+	if cfg.Benchmark {
+		return m, nil
+	}
+
 	// Perform an initial call to getwork when solo mining so work is available
 	// immediately.
 	if cfg.Pool == "" {