Skip to content

Commit

Permalink
Merge pull request #473 from mapbox/json-join
Browse files Browse the repository at this point in the history
Working on tools for joining CSV data to GeoJSON
  • Loading branch information
e-n-f authored Nov 21, 2017
2 parents 4754084 + 9a12a76 commit ac67013
Show file tree
Hide file tree
Showing 14 changed files with 993 additions and 235 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 1.27.0

* Add tippecanoe-json-tool for sorting and joining GeoJSON files
* Fix problem where --detect-shared-borders could simplify polygons away
* Attach --coalesce-smallest-as-needed leftovers to the last feature, not the first
* Fix overflow when iterating through 0-length lists backwards

## 1.26.7

* Add an option to quiet the progress indicator but not warnings
Expand Down
21 changes: 14 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,22 @@ else
FINAL_FLAGS := -g $(WARNING_FLAGS) $(DEBUG_FLAGS)
endif

all: tippecanoe tippecanoe-enumerate tippecanoe-decode tile-join unit geojson2nd
all: tippecanoe tippecanoe-enumerate tippecanoe-decode tile-join unit tippecanoe-json-tool

docs: man/tippecanoe.1

install: tippecanoe tippecanoe-enumerate tippecanoe-decode tile-join
install: tippecanoe tippecanoe-enumerate tippecanoe-decode tile-join tippecanoe-json-tool
mkdir -p $(PREFIX)/bin
mkdir -p $(MANDIR)
cp tippecanoe $(PREFIX)/bin/tippecanoe
cp tippecanoe-enumerate $(PREFIX)/bin/tippecanoe-enumerate
cp tippecanoe-decode $(PREFIX)/bin/tippecanoe-decode
cp tippecanoe-json-tool $(PREFIX)/bin/tippecanoe-json-tool
cp tile-join $(PREFIX)/bin/tile-join
cp man/tippecanoe.1 $(MANDIR)/tippecanoe.1

uninstall:
rm $(PREFIX)/bin/tippecanoe $(PREFIX)/bin/tippecanoe-enumerate $(PREFIX)/bin/tippecanoe-decode $(PREFIX)/bin/tile-join $(MANDIR)/tippecanoe.1
rm $(PREFIX)/bin/tippecanoe $(PREFIX)/bin/tippecanoe-enumerate $(PREFIX)/bin/tippecanoe-decode $(PREFIX)/bin/tile-join $(MANDIR)/tippecanoe.1 $(PREFIX)/bin/tippecanoe-json-tool

man/tippecanoe.1: README.md
md2man-roff README.md > man/tippecanoe.1
Expand All @@ -58,7 +59,7 @@ tippecanoe-decode: decode.o projection.o mvt.o write_json.o text.o
tile-join: tile-join.o projection.o pool.o mbtiles.o mvt.o memfile.o dirtiles.o jsonpull/jsonpull.o text.o evaluator.o csv.o
$(CXX) $(PG) $(LIBS) $(FINAL_FLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -lm -lz -lsqlite3 -lpthread

geojson2nd: geojson2nd.o jsonpull/jsonpull.o
tippecanoe-json-tool: jsontool.o jsonpull/jsonpull.o csv.o
$(CXX) $(PG) $(LIBS) $(FINAL_FLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) -lm -lz -lsqlite3 -lpthread

unit: unit.o text.o
Expand All @@ -81,7 +82,7 @@ indent:
TESTS = $(wildcard tests/*/out/*.json)
SPACE = $(NULL) $(NULL)

test: tippecanoe tippecanoe-decode $(addsuffix .check,$(TESTS)) raw-tiles-test parallel-test pbf-test join-test enumerate-test decode-test join-filter-test unit
test: tippecanoe tippecanoe-decode $(addsuffix .check,$(TESTS)) raw-tiles-test parallel-test pbf-test join-test enumerate-test decode-test join-filter-test unit json-tool-test
./unit

# Work around Makefile and filename punctuation limits: _ for space, @ for :, % for /
Expand All @@ -92,14 +93,14 @@ test: tippecanoe tippecanoe-decode $(addsuffix .check,$(TESTS)) raw-tiles-test p
rm $@.out $@.mbtiles

# Don't test overflow with geobuf, because it fails (https://github.com/mapbox/geobuf/issues/87)
geobuf-test: geojson2nd $(addsuffix .checkbuf,$(filter-out tests/overflow/out/-z0.json,$(TESTS)))
geobuf-test: tippecanoe-json-tool $(addsuffix .checkbuf,$(filter-out tests/overflow/out/-z0.json,$(TESTS)))

# For quicker address sanitizer build, hope that regular JSON parsing is tested enough by parallel and join tests
fewer-tests: tippecanoe tippecanoe-decode geobuf-test raw-tiles-test parallel-test pbf-test join-test enumerate-test decode-test join-filter-test unit

# XXX Use proper makefile rules instead of a for loop
%.json.checkbuf:
for i in $(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json); do ./geojson2nd -w $$i | ./node_modules/geobuf/bin/json2geobuf > $$i.geobuf; done
for i in $(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json); do ./tippecanoe-json-tool -w $$i | ./node_modules/geobuf/bin/json2geobuf > $$i.geobuf; done
./tippecanoe -aD -f -o $@.mbtiles $(subst @,:,$(subst %,/,$(subst _, ,$(patsubst %.json.checkbuf,%,$(word 4,$(subst /, ,$@)))))) $(addsuffix .geobuf,$(wildcard $(subst $(SPACE),/,$(wordlist 1,2,$(subst /, ,$@)))/*.json)) < /dev/null
./tippecanoe-decode $@.mbtiles | sed 's/checkbuf/check/g' > $@.out
cmp $@.out $(patsubst %.checkbuf,%,$@)
Expand Down Expand Up @@ -232,6 +233,12 @@ join-filter-test:
cmp tests/feature-filter/out/filtered.json.check tests/feature-filter/out/filtered.json.standard
rm -f tests/feature-filter/out/filtered.json.check tests/feature-filter/out/filtered.mbtiles tests/feature-filter/out/all.mbtiles

json-tool-test: tippecanoe-json-tool
./tippecanoe-json-tool -e GEOID10 tests/join-population/tabblock_06001420.json | sort > tests/join-population/tabblock_06001420.json.sort
./tippecanoe-json-tool -c tests/join-population/population.csv tests/join-population/tabblock_06001420.json.sort > tests/join-population/tabblock_06001420.json.sort.joined
cmp tests/join-population/tabblock_06001420.json.sort.joined tests/join-population/tabblock_06001420.json.sort.joined.standard
rm -f tests/join-population/tabblock_06001420.json.sort tests/join-population/tabblock_06001420.json.sort.joined

# Use this target to regenerate the standards that the tests are compared against
# after making a change that legitimately changes their output

Expand Down
62 changes: 62 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -595,3 +595,65 @@ resolutions.
* `-c` or `--tag-layer-and-zoom`: Include each feature's layer and zoom level as part of its `tippecanoe` object rather than as a FeatureCollection wrapper
* `-S` or `--stats`: Just report statistics about each tile's size and the number of features in it, as a JSON structure.
* `-f` or `--force`: Decode tiles even if polygon ring order or closure problems are detected

tippecanoe-json-tool
====================

Extracts GeoJSON features or standalone geometries as line-delimited JSON objects from a larger JSON file,
following the same extraction rules that Tippecanoe uses when parsing JSON.

tippecanoe-json-tool file.json [... file.json]

Optionally also wraps them in a FeatureCollection or GeometryCollection as appropriate.

Optionally extracts an attribute from the GeoJSON `properties` for sorting.

Optionally joins a sorted CSV of new attributes to a sorted GeoJSON file.

The reason for requiring sorting is so that it is possible to work on CSV and GeoJSON files that are larger
than can comfortably fit in memory by streaming through them in parallel, in the same way that the Unix
`join` command does. The Unix `sort` command can be used to sort large files to prepare them for joining.

The sorting interface is weird, and future version of `tippecanoe-json-tool` will replace it with
something better.

### Options

* `-w` or `--wrap`: Add the FeatureCollection or GeometryCollection wrapper.
* `-e` *attribute* or `--extract=`*attribute*: Extract the named attribute as a prefix to each feature.
The formatting makes excessive use of `\u` quoting so that it follows JSON string rules but will still
be sorted correctly by tools that just do ASCII comparisons.
* `-c` *file.csv* or `--csv=`*file.csv*: Join properties from the named sorted CSV file, using its first column as the join key. Geometries will be passed through even if they do not match the CSV; CSV lines that do not match a geometry will be discarded.

### Example

Join Census LEHD ([Longitudinal Employer-Household Dynamics](https://lehd.ces.census.gov/)) employment data to a file of Census block geography
for Tippecanoe County, Indiana.

Download Census block geometry, and convert to GeoJSON:

```
$ curl -L -O https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_18157_tabblock10.zip
$ unzip tl_2010_18157_tabblock10.zip
$ ogr2ogr -f GeoJSON tl_2010_18157_tabblock10.json tl_2010_18157_tabblock10.shp
```

Download Indiana employment data, and fix name of join key in header

```
$ curl -L -O https://lehd.ces.census.gov/data/lodes/LODES7/in/wac/in_wac_S000_JT00_2015.csv.gz
$ gzip -dc in_wac_S000_JT00_2015.csv.gz | sed '1s/w_geocode/GEOID10/' > in_wac_S000_JT00_2015.csv
```

Sort GeoJSON block geometry so it is ordered by block ID. If you don't do this, you will get a
"GeoJSON file is out of sort" error.

```
$ tippecanoe-json-tool -e GEOID10 tl_2010_18157_tabblock10.json | LC_ALL=C sort > tl_2010_18157_tabblock10.sort.json
```

Join block geometries to employment properties:

```
$ tippecanoe-json-tool -c in_wac_S000_JT00_2015.csv tl_2010_18157_tabblock10.sort.json > blocks-wac.json
```
8 changes: 4 additions & 4 deletions csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ std::string csv_dequote(std::string s) {
return out;
}

std::string getline(FILE *f) {
std::string csv_getline(FILE *f) {
std::string out;
int c;
while ((c = getc(f)) != EOF) {
Expand All @@ -58,22 +58,22 @@ std::string getline(FILE *f) {
return out;
}

void readcsv(char *fn, std::vector<std::string> &header, std::map<std::string, std::vector<std::string>> &mapping) {
void readcsv(const char *fn, std::vector<std::string> &header, std::map<std::string, std::vector<std::string>> &mapping) {
FILE *f = fopen(fn, "r");
if (f == NULL) {
perror(fn);
exit(EXIT_FAILURE);
}

std::string s;
if ((s = getline(f)).size() > 0) {
if ((s = csv_getline(f)).size() > 0) {
header = csv_split(s.c_str());

for (size_t i = 0; i < header.size(); i++) {
header[i] = csv_dequote(header[i]);
}
}
while ((s = getline(f)).size() > 0) {
while ((s = csv_getline(f)).size() > 0) {
std::vector<std::string> line = csv_split(s.c_str());
if (line.size() > 0) {
line[0] = csv_dequote(line[0]);
Expand Down
4 changes: 2 additions & 2 deletions csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
#include <string>
#include <map>

std::vector<std::string> csv_split(char *s);
std::vector<std::string> csv_split(const char *s);
std::string csv_dequote(std::string s);
void readcsv(char *fn, std::vector<std::string> &header, std::map<std::string, std::vector<std::string>> &mapping);
void readcsv(const char *fn, std::vector<std::string> &header, std::map<std::string, std::vector<std::string>> &mapping);
std::string csv_getline(FILE *f);
bool is_number(std::string const &s);

Expand Down
153 changes: 0 additions & 153 deletions geojson2nd.cpp

This file was deleted.

Loading

0 comments on commit ac67013

Please sign in to comment.