diff --git a/Makefile b/Makefile index 4928885..ac72fd7 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ else FINAL_FLAGS := -g $(WARNING_FLAGS) $(DEBUG_FLAGS) endif -PGMS := tile-count-create tile-count-decode tile-count-tile tile-count-merge +PGMS := tile-count-create tile-count-decode tile-count-tile tile-count-merge tile-count-grep tile-count-normalize all: $(PGMS) @@ -46,6 +46,12 @@ tile-count-tile: tippecanoe/projection.o tile.o header.o serial.o tippecanoe/mbt tile-count-merge: mergetool.o header.o serial.o merge.o $(CXX) $(PG) $(LIBS) $(FINAL_FLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -lm -lz -lsqlite3 -lpthread +tile-count-grep: tippecanoe/projection.o grep.o header.o serial.o jsonpull/jsonpull.o + $(CXX) $(PG) $(LIBS) $(FINAL_FLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -lm -lz -lsqlite3 -lpthread + +tile-count-normalize: normalize.o header.o serial.o + $(CXX) $(PG) $(LIBS) $(FINAL_FLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -lm -lz -lsqlite3 -lpthread + -include $(wildcard *.d) %.o: %.c diff --git a/README.md b/README.md index d5b59c6..2814dbd 100644 --- a/README.md +++ b/README.md @@ -123,8 +123,7 @@ instead of merging existing tilesets. The *maxzoom* plus the *detail* always equ * `-p` *cpus*: Use the specified number of parallel tasks. * `-q`: Silence the progress indicator -Relationship between bin size, maxzoom, and detail --------------------------------------------------- +## Relationship between bin size, maxzoom, and detail What exactly the "detail" parameter means is often the source of confusion. It is the difference between the maxzoom and the bin size. @@ -142,6 +141,28 @@ to the maxzoom: if you have data with a bin size of 24, and you want 256x256 tiles, 2^8=256 so you should specify a detail of 8, and the maxzoom will be 16 because 24-8=16. +Matching GeoJSON against count files +------------------------------------ + + tile-count-grep [-v] [-s binsize] in.count [file.json ...] + +matches GeoJSON objects on the standard input or the specified `.json` files against +locations in `in.count` and writes the ones that contain a previously counted location +to the standard output as newline-delimited GeoJSON. + +### Options + +* `-s` *binsize*: Specify the zoom level tile size for the precision of matching. +* `-v`: Reverse the sense of matching: Output the GeoJSON objects that *don't* match. + +Normalizing densities +--------------------- + + tile-count-normalize -s binsize -o out.count in.count + +Finds the densest bin of size *binsize* in `in.count` and writes out a new file `out.count` +in which the values are scaled up so that all populated bins of this size are equally dense. + Internal file format -------------------- diff --git a/grep.cpp b/grep.cpp new file mode 100644 index 0000000..0fb7ab6 --- /dev/null +++ b/grep.cpp @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "tippecanoe/projection.hpp" +#include "header.hpp" +#include "serial.hpp" +#include "merge.hpp" + +extern "C" { +#include "jsonpull/jsonpull.h" +} + +bool quiet = false; +unsigned long long mask = 0; + +void usage(char **argv) { + fprintf(stderr, "Usage: %s [-v] [-s binsize] in.count [in.json ...]\n", argv[0]); +} + +int maskcmp(const void *a, const void *b) { + unsigned char *ca = (unsigned char *) a; + unsigned char *cb = (unsigned char *) b; + + unsigned long la = read64(ca) & mask; + unsigned long lb = read64(cb) & mask; + + if (la < lb) { + return -1; + } else if (la > lb) { + return 1; + } else { + return 0; + } +} + +bool check_coordinates(json_pull *jp, json_object *j, json_object *coordinates, const char *map, size_t maplen) { + if (coordinates->type == JSON_ARRAY) { + if (coordinates->length >= 2 && coordinates->array[0]->type == JSON_NUMBER && + coordinates->array[1]->type == JSON_NUMBER) { + long long x, y; + projection->project(coordinates->array[0]->number, coordinates->array[1]->number, 32, &x, &y); + unsigned long long index = encode(x, y); + + unsigned char ix[INDEX_BYTES]; + unsigned char *p = ix; + write64(&p, index); + + void *found = bsearch(ix, map + HEADER_LEN, (maplen - HEADER_LEN) / RECORD_BYTES, RECORD_BYTES, maskcmp); + + if (found != NULL) { + return true; + } + } else { + for (size_t i = 0; i < coordinates->length; i++) { + if (check_coordinates(jp, j, coordinates->array[i], map, maplen)) { + return true; + } + } + } + } + + return false; +} + +void read_json(FILE *in, const char *fname, long long &seq, const char *map, size_t maplen, bool reverse) { + json_pull *jp = json_begin_file(in); + + while (1) { + json_object *j = json_read(jp); + if (j == NULL) { + if (jp->error != NULL) { + fprintf(stderr, "%s:%d: %s\n", fname, jp->line, jp->error); + } + + json_free(jp->root); + break; + } + + json_object *type = json_hash_get(j, "type"); + if (type == NULL || type->type != JSON_STRING) { + continue; + } + + if (strcmp(type->string, "Feature") != 0) { + if (strcmp(type->string, "FeatureCollection") == 0) { + json_free(j); + } + + continue; + } + + json_object *geometry = json_hash_get(j, "geometry"); + if (geometry == NULL) { + fprintf(stderr, "%s:%d: feature with no geometry\n", fname, jp->line); + json_free(j); + continue; + } + + json_object *coordinates = json_hash_get(geometry, "coordinates"); + if (coordinates == NULL) { + fprintf(stderr, "%s:%d: geometry with no coordinates\n", fname, jp->line); + json_free(j); + continue; + } + + if (seq % 100000 == 0) { + if (!quiet) { + fprintf(stderr, "Read %.1f million records\r", seq / 1000000.0); + } + } + seq++; + + if (check_coordinates(jp, j, coordinates, map, maplen) != reverse) { + const char *out = json_stringify(j); + printf("%s\n", out); + free((void *) out); + } + + json_free(j); + } + + json_end(jp); +} + +int main(int argc, char **argv) { + extern int optind; + extern char *optarg; + + bool reverse = false; + int zoom = 32; + size_t cpus = sysconf(_SC_NPROCESSORS_ONLN); + + int i; + while ((i = getopt(argc, argv, "s:vq")) != -1) { + switch (i) { + case 's': + zoom = atoi(optarg); + break; + + case 'v': + reverse = true; + break; + + case 'q': + quiet = true; + break; + + default: + usage(argv); + exit(EXIT_FAILURE); + } + } + + if (optind >= argc) { + usage(argv); + exit(EXIT_FAILURE); + } + + if (zoom != 0) { + mask = 0xFFFFFFFFFFFFFFFFULL << (64 - 2 * zoom); + } + + const char *countfname = argv[optind]; + int fd = open(countfname, O_RDONLY); + if (fd < 0) { + perror(countfname); + exit(EXIT_FAILURE); + } + + struct stat st; + if (fstat(fd, &st) != 0) { + perror(countfname); + exit(EXIT_FAILURE); + } + + const char *map = (const char *) mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (map == MAP_FAILED) { + perror(countfname); + exit(EXIT_FAILURE); + } + + optind++; + + long long seq = 0; + if (optind == argc) { + read_json(stdin, "standard input", seq, map, st.st_size, reverse); + } else { + for (; optind < argc; optind++) { + FILE *in = fopen(argv[optind], "r"); + if (in == NULL) { + perror(argv[optind]); + exit(EXIT_FAILURE); + } else { + read_json(in, argv[optind], seq, map, st.st_size, reverse); + fclose(in); + } + } + } + + return 0; +} diff --git a/normalize.cpp b/normalize.cpp new file mode 100644 index 0000000..f062c22 --- /dev/null +++ b/normalize.cpp @@ -0,0 +1,211 @@ +#include +#include +#include +#include +#include +#include +#include +#include "header.hpp" +#include "serial.hpp" +#include "merge.hpp" + +bool quiet = false; + +void usage(char **argv) { + fprintf(stderr, "Usage: %s -s binsize -o out.count in.count ...\n", argv[0]); +} + +long long find_max(unsigned char *map, size_t size, int zoom) { + long long max = 0; + unsigned long long mask = 0; + unsigned long long curindex = 0; + long long curcount = 0; + int oprogress = 0; + + if (zoom != 0) { + mask = 0xFFFFFFFFFFFFFFFFULL << (64 - 2 * zoom); + } + + for (size_t i = 0; i < size; i += RECORD_BYTES) { + unsigned long long index = read64(map + i) & mask; + unsigned long long count = read32(map + i + INDEX_BYTES); + + int progress = 100 * i / size; + if (progress != oprogress) { + fprintf(stderr, "Find max: %d%%\r", progress); + oprogress = progress; + } + + if (index != curindex) { + if (curcount > max) { + max = curcount; + } + + curindex = index; + curcount = 0; + } + + curcount += count; + } + + return max; +} + +void normalize1(unsigned char *map, int zoom, FILE *out, size_t start, size_t end, unsigned long long base, unsigned long long max) { + for (size_t i = start; i < end; i += RECORD_BYTES) { + unsigned long long index = read64(map + i); + unsigned long long count = read32(map + i + INDEX_BYTES); + + count *= (double) max / base; + + write64(out, index); + write32(out, count); + } +} + +void normalize(unsigned char *map, size_t size, int zoom, FILE *out, unsigned long long max) { + unsigned long long mask = 0; + int oprogress = 0; + + size_t start = 0; + unsigned long long starti = 0; + + size_t end = 0; + unsigned long long endi = 0; + + unsigned long long bincount = 0; + unsigned long long binwidth = 1 << (64 - 2 * zoom); + + if (zoom != 0) { + mask = 0xFFFFFFFFFFFFFFFFULL << (64 - 2 * zoom); + } + + for (size_t i = 0; i < size; i += RECORD_BYTES) { + unsigned long long index = read64(map + i); + unsigned long long count = read32(map + i + INDEX_BYTES); + + int progress = 100 * i / size; + if (progress != oprogress) { + fprintf(stderr, "Normalize: %d%%\r", progress); + oprogress = progress; + } + + unsigned long long want_starti = 0; + if (index > binwidth) { + want_starti = index - binwidth; + } + unsigned long long want_endi = 0xFFFFFFFFFFFFFFFFULL; + if (index < want_endi - binwidth) { + want_endi = index + binwidth; + } + + while (start < size && starti < want_starti) { + unsigned long long ii = read64(map + start); + unsigned long long cc = read32(map + start + INDEX_BYTES); + + bincount -= cc; + start += RECORD_BYTES; + starti = ii; + } + while (end < size && endi < want_endi) { + unsigned long long ii = read64(map + end); + unsigned long long cc = read32(map + end + INDEX_BYTES); + + bincount += cc; + end += RECORD_BYTES; + endi = ii; + } + + // printf("%zu %llx (%llx %llx) %zu %llx %zu %llx %lld %lld wid %llx\n", i, index, want_starti, want_endi, start, starti, end, endi, count, bincount, binwidth); + + count *= (double) max / bincount; + + write64(out, index); + write32(out, count); + } +} + +int main(int argc, char **argv) { + extern int optind; + extern char *optarg; + + char *outfile = NULL; + int zoom = 32; + size_t cpus = sysconf(_SC_NPROCESSORS_ONLN); + + int i; + while ((i = getopt(argc, argv, "o:s:qp:")) != -1) { + switch (i) { + case 's': + zoom = atoi(optarg); + break; + + case 'o': + outfile = optarg; + break; + + case 'p': + cpus = atoi(optarg); + break; + + case 'q': + quiet = true; + break; + + default: + usage(argv); + exit(EXIT_FAILURE); + } + } + + if (argc - optind != 1) { + usage(argv); + exit(EXIT_FAILURE); + } + + int fd = open(argv[optind], O_RDONLY); + if (fd < 0) { + perror(argv[optind]); + exit(EXIT_FAILURE); + } + + struct stat st; + if (fstat(fd, &st) != 0) { + perror("stat"); + exit(EXIT_FAILURE); + } + + unsigned char *maps = (unsigned char *) mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + + if (st.st_size < HEADER_LEN || memcmp(maps, header_text, HEADER_LEN) != 0) { + fprintf(stderr, "%s:%s: Not a tile-count file\n", argv[0], argv[optind]); + exit(EXIT_FAILURE); + } + + if (close(fd) < 0) { + perror("close"); + exit(EXIT_FAILURE); + } + + long long max = find_max(maps + HEADER_LEN, st.st_size - HEADER_LEN, zoom); + + FILE *out = fopen(outfile, "wb"); + if (out == NULL) { + perror(outfile); + exit(EXIT_FAILURE); + } + + if (fwrite(header_text, HEADER_LEN, 1, out) != 1) { + perror("write header"); + exit(EXIT_FAILURE); + } + + normalize(maps + HEADER_LEN, st.st_size - HEADER_LEN, zoom, out, max); + + if (fclose(out) != 0) { + perror("fclose"); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/serial.cpp b/serial.cpp index 08cb457..c7763cf 100644 --- a/serial.cpp +++ b/serial.cpp @@ -38,7 +38,7 @@ void write32(unsigned char **out, unsigned long long v) { } } -unsigned long long read64(unsigned char *c) { +unsigned long long read64(const unsigned char *c) { unsigned long long out = 0; for (ssize_t i = 0; i < 8; i++) { @@ -48,7 +48,7 @@ unsigned long long read64(unsigned char *c) { return out; } -unsigned long long read32(unsigned char *c) { +unsigned long long read32(const unsigned char *c) { unsigned long long out = 0; for (ssize_t i = 0; i < 4; i++) { diff --git a/serial.hpp b/serial.hpp index 3f39afd..8fcacd6 100644 --- a/serial.hpp +++ b/serial.hpp @@ -2,5 +2,5 @@ void write64(FILE *out, unsigned long long v); void write64(unsigned char **out, unsigned long long v); void write32(FILE *out, unsigned long long v); void write32(unsigned char **out, unsigned long long v); -unsigned long long read64(unsigned char *c); -unsigned long long read32(unsigned char *c); +unsigned long long read64(const unsigned char *c); +unsigned long long read32(const unsigned char *c);