From b06e682bae6f04ad9147659edbd8eecb9dfe2d3f Mon Sep 17 00:00:00 2001 From: TANG ZhiXiong Date: Sat, 18 Nov 2023 22:02:20 +0800 Subject: [PATCH] index geobuf in protobuf format, supports id query, bbox query (#27) * some TODO * explicit header_size * add draft for geobuf_index proto * no default * geobuf index * not ready * add js decoding * fix * lint code * add attrs * not ready * fix * ready to release * fix --------- Co-authored-by: TANG ZHIXIONG --- .gitignore | 2 + Makefile | 3 + bin/index2json | 15 ++ docs/about/release-notes.md | 4 + geobuf_index.js | 53 +++++ geobuf_index.proto | 38 ++++ package.json | 32 ++++ pybind11_geobuf/__main__.py | 15 +- setup.py | 2 +- src/geobuf/geobuf.cpp | 17 +- src/geobuf/geobuf.hpp | 6 +- src/geobuf/geobuf_index.hpp | 373 +++++++++++++++++++++++------------- src/main.cpp | 35 +++- tests/test_geobuf.py | 21 +- 14 files changed, 466 insertions(+), 150 deletions(-) create mode 100755 bin/index2json create mode 100644 geobuf_index.js create mode 100644 geobuf_index.proto create mode 100644 package.json diff --git a/.gitignore b/.gitignore index 07f4000..38f8daf 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ benchmarks/*.pbf* data/suzhoubeizhan.json tests/*.json data/suzhoubeizhan.idx +package-lock.json +node_modules diff --git a/Makefile b/Makefile index 7f48cd5..f8f4d22 100644 --- a/Makefile +++ b/Makefile @@ -146,6 +146,9 @@ cli_test4: .PHONY: cli_test cli_test1 cli_test2 cli_test3 +geobuf_index.js: geobuf_index.proto + pbf $< > $@ + # conda create -y -n py36 python=3.6 # conda create -y -n py37 python=3.7 # conda create -y -n py38 python=3.8 diff --git a/bin/index2json b/bin/index2json new file mode 100755 index 0000000..d2f07c6 --- /dev/null +++ b/bin/index2json @@ -0,0 +1,15 @@ +#!/usr/bin/env node + +var Index = require('../geobuf_index.js').Index, + Pbf = require('pbf'), + fs = require('fs') + concat = require('concat-stream'); + +var input = process.stdin.isTTY ? fs.createReadStream(process.argv[2]) : process.stdin; + +input.pipe(concat(function(buf) { + var pbf = new Pbf(buf); + var obj = Index.read(pbf); + var data = JSON.stringify(obj); + process.stdout.write(Buffer.allocUnsafe ? Buffer.from(data) : new Buffer(data)); +})); diff --git a/docs/about/release-notes.md b/docs/about/release-notes.md index 81bf1ad..7bc19cd 100644 --- a/docs/about/release-notes.md +++ b/docs/about/release-notes.md @@ -10,6 +10,10 @@ To upgrade `pybind11-geobuf` to the latest version, use pip: pip install -U pybind11-geobuf ``` +## Version 0.2.0 (2023-11-18) + +* Indexing geobuf in protobuf format; Spec: geobuf_index.proto + ## Version 0.1.9 (2023-11-15) * Indexing geobuf (like flatgeobuf, but more general), making it random accessible diff --git a/geobuf_index.js b/geobuf_index.js new file mode 100644 index 0000000..aba100e --- /dev/null +++ b/geobuf_index.js @@ -0,0 +1,53 @@ +'use strict'; // code generated by pbf v3.2.1 + +// Index ======================================== + +var Index = exports.Index = {}; + +Index.read = function (pbf, end) { + return pbf.readFields(Index._readField, {header_size: 0, num_features: 0, offsets: [], fids: [], idxs: [], packed_rtree: null}, end); +}; +Index._readField = function (tag, obj, pbf) { + if (tag === 1) obj.header_size = pbf.readVarint(); + else if (tag === 2) obj.num_features = pbf.readVarint(); + else if (tag === 3) pbf.readPackedVarint(obj.offsets); + else if (tag === 4) obj.fids.push(pbf.readString()); + else if (tag === 5) pbf.readPackedVarint(obj.idxs); + else if (tag === 8) obj.packed_rtree = Index.PackedRTree.read(pbf, pbf.readVarint() + pbf.pos); +}; +Index.write = function (obj, pbf) { + if (obj.header_size) pbf.writeVarintField(1, obj.header_size); + if (obj.num_features) pbf.writeVarintField(2, obj.num_features); + if (obj.offsets) pbf.writePackedVarint(3, obj.offsets); + if (obj.fids) for (var i = 0; i < obj.fids.length; i++) pbf.writeStringField(4, obj.fids[i]); + if (obj.idxs) pbf.writePackedVarint(5, obj.idxs); + if (obj.packed_rtree) pbf.writeMessage(8, Index.PackedRTree.write, obj.packed_rtree); +}; + +// Index.PackedRTree ======================================== + +Index.PackedRTree = {}; + +Index.PackedRTree.read = function (pbf, end) { + return pbf.readFields(Index.PackedRTree._readField, {left: 0, bottom: 0, right: 0, top: 0, num_items: 0, num_nodes: 0, node_size: 0, serialized: null}, end); +}; +Index.PackedRTree._readField = function (tag, obj, pbf) { + if (tag === 1) obj.left = pbf.readDouble(); + else if (tag === 2) obj.bottom = pbf.readDouble(); + else if (tag === 3) obj.right = pbf.readDouble(); + else if (tag === 4) obj.top = pbf.readDouble(); + else if (tag === 5) obj.num_items = pbf.readVarint(); + else if (tag === 6) obj.num_nodes = pbf.readVarint(); + else if (tag === 7) obj.node_size = pbf.readVarint(); + else if (tag === 8) obj.serialized = pbf.readBytes(); +}; +Index.PackedRTree.write = function (obj, pbf) { + if (obj.left) pbf.writeDoubleField(1, obj.left); + if (obj.bottom) pbf.writeDoubleField(2, obj.bottom); + if (obj.right) pbf.writeDoubleField(3, obj.right); + if (obj.top) pbf.writeDoubleField(4, obj.top); + if (obj.num_items) pbf.writeVarintField(5, obj.num_items); + if (obj.num_nodes) pbf.writeVarintField(6, obj.num_nodes); + if (obj.node_size) pbf.writeVarintField(7, obj.node_size); + if (obj.serialized) pbf.writeBytesField(8, obj.serialized); +}; diff --git a/geobuf_index.proto b/geobuf_index.proto new file mode 100644 index 0000000..2a8cf30 --- /dev/null +++ b/geobuf_index.proto @@ -0,0 +1,38 @@ +option optimize_for = LITE_RUNTIME; + +message Index { + uint32 header_size = 1; // can parse Data[0:header_size] to init keys/dim/e + uint32 num_features = 2; + repeated uint64 offsets = 3 [packed = true]; // can parse Data[offsets[i]:offsets[i+1]] to get ith feature + // len(offsets) == num_features + 2, can parse data[offsets[-2]:offsets[-1]] to get feature_collection.custom_properties + + // feature id? + // - feature.id + // - feature.properties.id + // - feature.properties.fid + // - feature.properties.feature_id + // related docs: + // - https://docs.mapbox.com/mapbox-gl-js/api/map/#instance-members-feature-state + // - https://github.com/mapbox/mapbox-gl-js/pull/8987 + repeated string fids = 4; + repeated uint32 idxs = 5 [packed = true]; + + optional PackedRTree packed_rtree = 8; // spatial index + message PackedRTree { + double left = 1; + double bottom = 2; + double right = 3; + double top = 4; + uint32 num_items = 5; + uint32 num_nodes = 6; + uint32 node_size = 7; + bytes serialized = 8; + } + + // Tools + // - encoding + // python3 -m pybind11_geobuf index_geobuf path/to/geobuf.pbf output/geobuf.idx + // - inspect index + // python3 -m pybind11_geobuf pbf_decode geobuf.idx + // node bin/index2json geobuf.idx # need `npm i pbf concat-stream` +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..2bb0765 --- /dev/null +++ b/package.json @@ -0,0 +1,32 @@ +{ + "name": "geobuf_index", + "version": "0.0.1", + "description": "index for geobuf", + "main": "index.js", + "bin": { + "debug_index": "bin/json2geobuf" + }, + "scripts": {}, + "repository": { + "type": "git", + "url": "git@github.com:cubao/geobuf_cpp.git" + }, + "files": [ + "index.js", + "bin", + "geobuf_index.proto" + ], + "keywords": [ + "geobuf", + "index" + ], + "license": "ISC", + "bugs": { + "url": "https://github.com/cubao/geobuf_cpp/issues" + }, + "homepage": "https://github.com/cubao/geobuf_cpp", + "dependencies": { + "concat-stream": "^2.0.0", + "pbf": "^3.2.1" + } +} diff --git a/pybind11_geobuf/__main__.py b/pybind11_geobuf/__main__.py index ee7a8f2..5600274 100644 --- a/pybind11_geobuf/__main__.py +++ b/pybind11_geobuf/__main__.py @@ -210,12 +210,23 @@ def is_subset_of(path1: str, path2: str): assert is_subset_of_impl(path1, path2) -def index_geobuf(input_geobuf_path: str, output_index_path: str): +def index_geobuf( + input_geobuf_path: str, + output_index_path: str, + *, + feature_id: Optional[str] = "@", + packed_rtree: Optional[str] = "@", +): os.makedirs( os.path.dirname(os.path.abspath(output_index_path)), exist_ok=True, ) - return GeobufIndex.indexing(input_geobuf_path, output_index_path) + return GeobufIndex.indexing( + input_geobuf_path, + output_index_path, + feature_id=feature_id, + packed_rtree=packed_rtree, + ) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 524466a..e9a14ab 100644 --- a/setup.py +++ b/setup.py @@ -127,7 +127,7 @@ def build_extension(self, ext): # logic and declaration, and simpler if you include description/version in a file. setup( name="pybind11_geobuf", - version="0.1.9", + version="0.2.0", author="tzx", author_email="dvorak4tzx@gmail.com", url="https://geobuf-cpp.readthedocs.io", diff --git a/src/geobuf/geobuf.cpp b/src/geobuf/geobuf.cpp index acdf24b..946f933 100644 --- a/src/geobuf/geobuf.cpp +++ b/src/geobuf/geobuf.cpp @@ -16,7 +16,7 @@ #include #include -#include "spdlog/spdlog.h" +#include "spdlog/spdlog.h" // fmt::format // fix exposed macro 'GetObject' from wingdi.h (included by spdlog.h) under // windows, see https://github.com/Tencent/rapidjson/issues/1448 #ifdef GetObject @@ -698,10 +698,13 @@ mapbox::geojson::geojson Decoder::decode(const uint8_t *data, size_t size) const auto tag = pbf.tag(); if (tag == 1) { keys.push_back(pbf.get_string()); + header_size = pbf.data().data() - head; } else if (tag == 2) { dim = pbf.get_uint32(); + header_size = pbf.data().data() - head; } else if (tag == 3) { e = std::pow(10, pbf.get_uint32()); + header_size = pbf.data().data() - head; } else if (tag == 4) { protozero::pbf_reader pbf_fc = pbf.get_message(); return readFeatureCollection(pbf_fc); @@ -822,9 +825,9 @@ mapbox::geojson::feature_collection Decoder::readFeatureCollection(Pbf &pbf) mapbox::geojson::feature_collection fc; std::vector values; offsets.clear(); - int props_cursor = -1; + std::optional props_cursor; while (true) { - int cursor = pbf.data().data() - head; + uint64_t cursor = pbf.data().data() - head; if (!pbf.next()) { break; } @@ -835,7 +838,7 @@ mapbox::geojson::feature_collection Decoder::readFeatureCollection(Pbf &pbf) offsets.push_back(cursor); continue; } - if (props_cursor < 0) { + if (!props_cursor) { props_cursor = cursor; } if (tag == 13) { @@ -856,9 +859,9 @@ mapbox::geojson::feature_collection Decoder::readFeatureCollection(Pbf &pbf) } } // props start - int tail = pbf.data().data() - head; - if (props_cursor > 0 && !offsets.empty() && props_cursor > offsets.back()) { - offsets.push_back(props_cursor); + uint64_t tail = pbf.data().data() - head; + if (props_cursor && !offsets.empty() && *props_cursor > offsets.back()) { + offsets.push_back(*props_cursor); } else { offsets.push_back(tail); } diff --git a/src/geobuf/geobuf.hpp b/src/geobuf/geobuf.hpp index 01fbae5..75a8b66 100644 --- a/src/geobuf/geobuf.hpp +++ b/src/geobuf/geobuf.hpp @@ -196,7 +196,8 @@ struct Decoder int precision() const { return std::log10(e); } int __dim() const { return dim; } std::vector __keys() const { return keys; } - std::vector __offsets() const { return offsets; } + uint32_t __header_size() const { return header_size; } + std::vector __offsets() const { return offsets; } private: mapbox::geojson::feature_collection readFeatureCollection(Pbf &pbf); @@ -211,7 +212,8 @@ struct Decoder std::vector keys; const char *head = nullptr; - std::vector offsets; + uint32_t header_size = -1; + std::vector offsets; }; } // namespace geobuf diff --git a/src/geobuf/geobuf_index.hpp b/src/geobuf/geobuf_index.hpp index 89b08d2..3ce2fb0 100644 --- a/src/geobuf/geobuf_index.hpp +++ b/src/geobuf/geobuf_index.hpp @@ -19,102 +19,156 @@ namespace cubao { -inline std::string to_hex(const std::string &s, bool upper_case = true) +inline std::optional +value2string(const mapbox::geojson::value &value) { - std::ostringstream ret; + return value.match( + [](uint64_t v) { return std::to_string(v); }, + [](int64_t v) { return std::to_string(v); }, + [](const std::string &v) { return v; }, + [](const auto &) -> std::optional { return {}; }); +} - for (std::string::size_type i = 0; i < s.length(); ++i) { - int z = s[i] & 0xff; - ret << std::hex << std::setfill('0') << std::setw(2) - << (upper_case ? std::uppercase : std::nouppercase) << z; +inline std::optional +fn_feature_id_default(const mapbox::geojson::feature &feature) +{ + if (!feature.id.is()) { + return feature.id.match( + [](uint64_t v) { return std::to_string(v); }, + [](int64_t v) { return std::to_string(v); }, + [](const std::string &v) { return v; }, + [](const auto &) -> std::optional { return {}; }); } - - return ret.str(); + for (const auto &key : {"id", "feature_id", "fid"}) { + auto itr = feature.properties.find(key); + if (itr == feature.properties.end()) { + continue; + } + auto id = value2string(itr->second); + if (id) { + return id; + } + } + return {}; } struct GeobufIndex { GeobufIndex() = default; - int num_features = -1; - std::vector offsets; - FlatGeobuf::PackedRTree rtree; - mio::shared_ummap_source mmap; - mapbox::geobuf::Decoder decoder; - - bool init(const std::string &bytes) - { - int cursor = 10; - if (bytes.substr(0, cursor) != "GeobufIdx0") { - spdlog::error("invalid geobuf index"); - return false; - } - const uint8_t *data = reinterpret_cast(bytes.data()); - num_features = *reinterpret_cast(data + cursor); - cursor += sizeof(num_features); - spdlog::info("#features: {}", num_features); - - FlatGeobuf::NodeItem extent; - memcpy((void *)&extent.minX, data + cursor, sizeof(extent)); - cursor += sizeof(extent); - spdlog::info("extent: {},{},{},{}", extent.minX, extent.minY, - extent.maxX, extent.maxY); - - int num_items{0}; - num_items = *reinterpret_cast(data + cursor); - cursor += sizeof(num_items); - spdlog::info("num_items: {}", num_items); - int num_nodes{0}; - num_nodes = *reinterpret_cast(data + cursor); - cursor += sizeof(num_nodes); - spdlog::info("num_nodes: {}", num_nodes); + uint32_t header_size = 0; + uint32_t num_features = 0; + std::vector offsets; + std::optional> ids; + std::optional packed_rtree; - int node_size{0}; - node_size = *reinterpret_cast(data + cursor); - cursor += sizeof(node_size); - spdlog::info("node_size: {}", node_size); - - int tree_size{0}; - tree_size = *reinterpret_cast(data + cursor); - cursor += sizeof(tree_size); - spdlog::info("tree_size: {}", tree_size); - - rtree = FlatGeobuf::PackedRTree(data + cursor, num_items, node_size); - if (rtree.getNumNodes() != num_nodes || rtree.getExtent() != extent) { - spdlog::error("invalid rtree, #nodes:{} != {} (expected)", - rtree.getNumNodes(), num_nodes); - return false; - } - cursor += tree_size; + using Encoder = mapbox::geobuf::Encoder; + using Decoder = mapbox::geobuf::Decoder; + Decoder decoder; + mio::shared_ummap_source mmap; - int padding{0}; - padding = *reinterpret_cast(data + cursor); - cursor += sizeof(padding); - if (padding != 930604) { - spdlog::error("invalid padding: {} != 930604 (geobuf)", padding); - return false; + bool init(const uint8_t *data, size_t size) + { + auto pbf = protozero::pbf_reader{(const char *)data, size}; + std::vector fids; + std::vector idxs; + while (pbf.next()) { + const auto tag = pbf.tag(); + if (tag == 1) { + header_size = pbf.get_uint32(); + spdlog::info("header_size: {}", header_size); + } else if (tag == 2) { + num_features = pbf.get_uint32(); + spdlog::info("num_features: {}", num_features); + } else if (tag == 3) { + auto iter = pbf.get_packed_uint64(); + offsets = std::vector(iter.begin(), iter.end()); + if (offsets.size() == num_features + 2u) { + spdlog::info("#offsets: {}, values: [{},{}, ..., {}, {}]", + offsets.size(), offsets[0], offsets[1], + offsets[num_features], + offsets[num_features + 1]); + } else { + spdlog::error("#offsets:{} != 2 + num_features:{}", + offsets.size(), num_features); + } + } else if (tag == 4) { + fids.push_back(pbf.get_string()); + } else if (tag == 5) { + auto iter = pbf.get_packed_uint32(); + idxs = std::vector(iter.begin(), iter.end()); + } else if (tag == 8) { + FlatGeobuf::NodeItem extent; + uint32_t num_items{0}; + uint32_t num_nodes{0}; + uint32_t node_size{0}; + std::string rtree_bytes; + protozero::pbf_reader pbf_rtree = pbf.get_message(); + while (pbf_rtree.next()) { + const auto tag = pbf_rtree.tag(); + if (tag == 1) { + extent.minX = pbf_rtree.get_double(); + } else if (tag == 2) { + extent.minY = pbf_rtree.get_double(); + } else if (tag == 3) { + extent.maxX = pbf_rtree.get_double(); + } else if (tag == 4) { + extent.maxY = pbf_rtree.get_double(); + } else if (tag == 5) { + num_items = pbf_rtree.get_uint32(); + } else if (tag == 6) { + num_nodes = pbf_rtree.get_uint32(); + } else if (tag == 7) { + node_size = pbf_rtree.get_uint32(); + } else if (tag == 8) { + rtree_bytes = pbf_rtree.get_bytes(); + } else { + pbf_rtree.skip(); + } + } + spdlog::info("PackedRTree num_items={}, num_nodes={}, " + "node_size={}, bbox=[{},{},{},{}], #bytes={}", + num_items, num_nodes, node_size, extent.minX, + extent.minY, extent.maxX, extent.maxY, + rtree_bytes.size()); + if (num_items > 0 && num_nodes > 0 && node_size > 0 && + extent.width() >= 0 && extent.height() >= 0 && + rtree_bytes.size() == + num_nodes * sizeof(FlatGeobuf::NodeItem)) { + packed_rtree = FlatGeobuf::PackedRTree( + (const uint8_t *)rtree_bytes.data(), num_items, + node_size); + if (packed_rtree->getExtent() != extent) { + extent = packed_rtree->getExtent(); + spdlog::error( + "extent mismatch, from RTree: {},{},{},{}", + extent.minX, extent.minY, extent.maxX, extent.maxY); + } + } else { + spdlog::error("invalid PackedRTree"); + } + } else { + pbf.skip(); + } } - - int num_offsets{0}; - num_offsets = *reinterpret_cast(data + cursor); - cursor += sizeof(num_offsets); - spdlog::info("num_offsets: {}", num_offsets); - - offsets.resize(num_offsets); - memcpy(reinterpret_cast(offsets.data()), data + cursor, - sizeof(offsets[0]) * num_offsets); - cursor += sizeof(offsets[0]) * num_offsets; - spdlog::info("offsets: [{}, ..., {}]", offsets.front(), offsets.back()); - - padding = *reinterpret_cast(data + cursor); - cursor += sizeof(padding); - if (padding != 930604) { - spdlog::error("invalid padding: {} != 930604 (geobuf)", padding); - return false; + if (fids.size() != idxs.size()) { + spdlog::error("bad feature ids, #fids:{} != #idxs:{}", fids.size(), + idxs.size()); + } else { + ids = std::unordered_map{}; + for (size_t i = 0, N = idxs.size(); i < N; ++i) { + ids->emplace(fids[i], idxs[i]); + } + spdlog::info("#feature_ids: {}", ids->size()); } return true; } + bool init(const std::string &bytes) + { + return init((const uint8_t *)bytes.data(), bytes.size()); + } + bool mmap_init(const std::string &index_path, const std::string &geobuf_path) { @@ -127,12 +181,14 @@ struct GeobufIndex } bool mmap_init(const std::string &geobuf_path) { - if (num_features < 0 || offsets.empty()) { + if (offsets.size() != num_features + 2u || header_size == 0) { throw std::invalid_argument("should init index first!!!"); } - spdlog::info("lazy decoding geobuf with mmap"); + spdlog::info( + "decoding geobuf with mmap, only parse {} bytes header for now", + header_size); mmap = std::make_shared(geobuf_path); - decoder.decode_header(mmap.data(), offsets[0]); + decoder.decode_header(mmap.data(), header_size); spdlog::info("decoded geobuf header, #keys={}, dim={}, precision: {}", decoder.__keys().size(), decoder.__dim(), decoder.precision()); @@ -162,11 +218,10 @@ struct GeobufIndex only_geometry, only_properties); } std::optional - decode_feature(int index, bool only_geometry = false, + decode_feature(uint32_t index, bool only_geometry = false, bool only_properties = false) { - bool valid_index = - 0 <= index && index < num_features && index + 1 < offsets.size(); + bool valid_index = index < num_features && index + 1 < offsets.size(); if (!valid_index) { return {}; } @@ -183,6 +238,20 @@ struct GeobufIndex return {}; } + std::optional + decode_feature_of_id(const std::string &id, bool only_geometry = false, + bool only_properties = false) + { + if (!ids) { + return {}; + } + auto itr = ids->find(id); + if (itr == ids->end()) { + return {}; + } + return decode_feature(itr->second, only_geometry, only_properties); + } + mapbox::geojson::feature_collection decode_features(const uint8_t *data, const std::vector> &index, @@ -224,23 +293,38 @@ struct GeobufIndex } mapbox::feature::value decode_non_features() { - if (num_features <= 0 || offsets.size() < num_features + 2) { + if (num_features <= 0 || offsets.size() < num_features + 2u) { return {}; } try { - int cursor = offsets[num_features]; - int length = offsets[num_features + 1] - cursor; - if (length <= 0 || !mmap.is_open()) { + size_t begin = offsets[num_features]; + size_t end = offsets[num_features + 1u]; + if (begin >= end || !mmap.is_open()) { return {}; } - return decode_non_features(mmap.data() + cursor, length); + return decode_non_features(mmap.data() + begin, end - begin); } catch (...) { } return {}; } + std::set query(const Eigen::Vector2d &min, + const Eigen::Vector2d &max) const + { + if (!packed_rtree) { + return {}; + } + std::set hits; + for (auto h : packed_rtree->search(min[0], min[1], max[0], max[1])) { + hits.insert(h.offset); + } + return hits; + } + static bool indexing(const std::string &input_geobuf_path, - const std::string &output_index_path) + const std::string &output_index_path, + const std::optional feature_id = "@", + const std::optional packed_rtree = "@") { spdlog::info("indexing {} ...", input_geobuf_path); auto decoder = mapbox::geobuf::Decoder(); @@ -250,51 +334,74 @@ struct GeobufIndex "invalid GeoJSON type, should be FeatureCollection"); } auto &fc = geojson.get(); - - FILE *fp = fopen(output_index_path.c_str(), "wb"); - if (!fp) { - spdlog::error("failed to open {} for writing", output_index_path); - return {}; + auto header_size = decoder.__header_size(); + auto offsets = decoder.__offsets(); + spdlog::info("#features: {}", fc.size()); + spdlog::info("header_size: {}", header_size); + if (offsets.size() == fc.size() + 2u) { + spdlog::info("#offsets: {}, values: [{},{}, ..., {}, {}]", + offsets.size(), offsets[0], offsets[1], + offsets[fc.size()], offsets[fc.size() + 1]); + } else { + spdlog::error("#offsets:{} != 2 + num_features:{}", offsets.size(), + fc.size()); } - auto planet = Planet(fc); - // magic - fwrite("GeobufIdx0", 10, 1, fp); - int num_features = fc.size(); - // #features - fwrite(&num_features, sizeof(num_features), 1, fp); - auto rtree = planet.packed_rtree(); - auto extent = rtree.getExtent(); - // extent/bbox - fwrite(&extent, sizeof(extent), 1, fp); - // num_items - int num_items = rtree.getNumItems(); - fwrite(&num_items, sizeof(num_items), 1, fp); - // num_nodes - int num_nodes = rtree.getNumNodes(); - fwrite(&num_nodes, sizeof(num_nodes), 1, fp); - // node_size - int node_size = rtree.getNodeSize(); - fwrite(&node_size, sizeof(node_size), 1, fp); - // tree_size - int tree_size = rtree.size(); - fwrite(&tree_size, sizeof(tree_size), 1, fp); - // tree_bytes - rtree.streamWrite([&](const uint8_t *data, size_t size) { - fwrite(data, 1, size, fp); - }); + std::string data; + Encoder::Pbf pbf{data}; + pbf.add_uint32(1, header_size); + pbf.add_uint32(2, fc.size()); + pbf.add_packed_uint64(3, offsets.begin(), offsets.end()); - const int padding = 930604; // geobuf - fwrite(&padding, sizeof(padding), 1, fp); + if (feature_id) { + std::map ids; + for (size_t i = 0; i < fc.size(); ++i) { + auto id = fn_feature_id_default(fc[i]); + if (id) { + ids.emplace(*id, static_cast(i)); + } + } + if (!ids.empty()) { + spdlog::info("#feature_ids: {}", ids.size()); + std::vector idxs; + idxs.reserve(ids.size()); + for (auto &pair : ids) { + pbf.add_string(4, pair.first); + idxs.push_back(pair.second); + } + pbf.add_packed_uint32(5, idxs.begin(), idxs.end()); + } + } - std::vector offsets = decoder.__offsets(); - int num_offsets = offsets.size(); - fwrite(&num_offsets, sizeof(num_offsets), 1, fp); - fwrite(offsets.data(), sizeof(offsets[0]), offsets.size(), fp); - fwrite(&padding, sizeof(padding), 1, fp); - fclose(fp); - spdlog::info("wrote index to {}", output_index_path); - return true; + if (packed_rtree) { + auto planet = Planet(fc); + if (*packed_rtree == "per_line_segment") { + planet.build(true); + } + auto rtree = planet.packed_rtree(); + auto extent = rtree.getExtent(); + spdlog::info("PackedRTree num_items={}, num_nodes={}, " + "node_size={}, bbox=[{},{},{},{}], #bytes={}", + rtree.getNumItems(), rtree.getNumNodes(), + rtree.getNodeSize(), extent.minX, extent.minY, + extent.maxX, extent.maxY, rtree.size()); + if (extent.width() >= 0 && extent.height() >= 0) { + protozero::pbf_writer pbf_rtree{pbf, 8}; + pbf_rtree.add_double(1, extent.minX); + pbf_rtree.add_double(2, extent.minY); + pbf_rtree.add_double(3, extent.maxX); + pbf_rtree.add_double(4, extent.maxY); + pbf_rtree.add_uint32(5, rtree.getNumItems()); + pbf_rtree.add_uint32(6, rtree.getNumNodes()); + pbf_rtree.add_uint32(7, rtree.getNodeSize()); + rtree.streamWrite([&](const uint8_t *data, size_t size) { + pbf_rtree.add_bytes(8, (const char *)data, size); + }); + } else { + spdlog::error("invalid PackedRTree"); + } + } + return mapbox::geobuf::dump_bytes(output_index_path, data); } }; } // namespace cubao diff --git a/src/main.cpp b/src/main.cpp index ce00ed8..f9f080a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -292,7 +292,20 @@ PYBIND11_MODULE(_pybind11_geobuf, m) using GeobufIndex = cubao::GeobufIndex; py::class_(m, "GeobufIndex", py::module_local()) // .def(py::init<>()) - .def("init", &GeobufIndex::init, "index_bytes"_a) + // attrs + .def_property_readonly( + "header_size", + [](const GeobufIndex &self) { return self.header_size; }) + .def_property_readonly( + "num_features", + [](const GeobufIndex &self) { return self.num_features; }) + .def_property_readonly( + "offsets", [](const GeobufIndex &self) { return self.offsets; }) + .def_property_readonly("ids", + [](const GeobufIndex &self) { return self.ids; }) + // + .def("init", py::overload_cast(&GeobufIndex::init), + "index_bytes"_a) // .def("mmap_init", py::overload_cast( @@ -315,7 +328,8 @@ PYBIND11_MODULE(_pybind11_geobuf, m) "offset"_a, "length"_a) // .def("decode_feature", - py::overload_cast(&GeobufIndex::decode_feature), + py::overload_cast( + &GeobufIndex::decode_feature), "index"_a, py::kw_only(), "only_geometry"_a = false, "only_properties"_a = false) .def("decode_feature", @@ -323,6 +337,11 @@ PYBIND11_MODULE(_pybind11_geobuf, m) &GeobufIndex::decode_feature), "bytes"_a, py::kw_only(), "only_geometry"_a = false, "only_properties"_a = false) + .def("decode_feature_of_id", + py::overload_cast( + &GeobufIndex::decode_feature), + "id"_a, py::kw_only(), "only_geometry"_a = false, + "only_properties"_a = false) .def("decode_features", py::overload_cast &, bool, bool>( &GeobufIndex::decode_features), @@ -335,9 +354,17 @@ PYBIND11_MODULE(_pybind11_geobuf, m) "bytes"_a) .def("decode_non_features", py::overload_cast<>(&GeobufIndex::decode_non_features)) + .def( + "query", + py::overload_cast( + &GeobufIndex::query, py::const_)) // - .def_static("indexing", &GeobufIndex::indexing, "input_geobuf_path"_a, - "output_index_path"_a) + .def_static("indexing", &GeobufIndex::indexing, // + "input_geobuf_path"_a, // + "output_index_path"_a, // + py::kw_only(), // + "feature_id"_a = "@", // + "packed_rtree"_a = "@") // ; diff --git a/tests/test_geobuf.py b/tests/test_geobuf.py index c0395a9..d1fe49d 100644 --- a/tests/test_geobuf.py +++ b/tests/test_geobuf.py @@ -1920,9 +1920,22 @@ def test_geobuf_index(): build_dir = os.path.abspath(f"{__pwd}/../build") opath = f"{build_dir}/suzhoubeizhan.idx" - assert GeobufIndex.indexing(ipath, opath) indexer = GeobufIndex() + assert indexer.header_size == indexer.num_features == 0 + assert not indexer.offsets + assert indexer.ids is None + + assert GeobufIndex.indexing(ipath, opath) + + # node bin/index2json build/suzhoubeizhan.idx > idx.json assert indexer.mmap_init(opath, ipath) + assert indexer.header_size == 52 + assert indexer.num_features == 1016 + offsets = indexer.offsets + assert len(offsets) == 1018 + assert offsets[:5] == [56, 318, 817, 999, 1174] + assert indexer.ids["24"] == 0 + expected = { "type": "Feature", "geometry": { @@ -2010,6 +2023,12 @@ def test_geobuf_index(): assert indexer.decode_non_features() assert indexer.decode_non_features()() == expected_custom_props + hits = indexer.query([120.64094, 31.41515], [120.64137, 31.41534]) + assert len(hits) == 4 + hits = [indexer.decode_feature(h) for h in hits] + hits = [h.properties()["id"]() for h in hits] + assert hits == ["943", "936", "937", "1174"] + if __name__ == "__main__": np.set_printoptions(suppress=True)