rustdoc-search: stringdex 0.0.2

Two index format tweaks that reduce the size of the search index for the
standard library, the compiler, and a wordnet dictionary in my tests.

(cherry picked from commit fdeb3633d9)
Author: Michael Howell
Date: 2025-10-13 21:12:21 -07:00
Committed by: Josh Stone
Parent: 716b84ed5b
Commit: 7ce30224e3

3 changed files with 314 additions and 47 deletions
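For context on the search.js changes below: they add an InlineNeighborsTree node type, a compact encoding for small subtrees whose leaves all share the upper 16 bits of their IDs, so each leaf is stored as just its low two bytes next to a single shared two-byte prefix. The Cargo.toml change also pins stringdex to exactly version 0.0.2. As a minimal sketch of the shared-prefix packing idea (packSharedPrefixLeaves is an illustrative name, not part of search.js):

    // Sketch only: pack leaf IDs that share their upper 16 bits as one
    // shared prefix plus two little-endian bytes per leaf, roughly the
    // layout that InlineNeighborsTree.decode() reads back into a
    // RoaringBitmap key and container.
    function packSharedPrefixLeaves(leafIds) {
        const upper = leafIds[0] >>> 16;
        if (!leafIds.every(id => (id >>> 16) === upper)) {
            throw new Error("leaves must share their upper 16 bits");
        }
        const lowBytes = new Uint8Array(leafIds.length * 2);
        leafIds.forEach((id, j) => {
            lowBytes[j * 2] = id & 0xff;
            lowBytes[j * 2 + 1] = (id >>> 8) & 0xff;
        });
        return { upper, lowBytes };
    }
    // Example: three neighbors under prefix 0x0002 cost 2 shared bytes
    // plus 6 suffix bytes:
    // packSharedPrefixLeaves([0x20005, 0x20007, 0x2000a])
    //   => { upper: 2, lowBytes: Uint8Array [5, 0, 7, 0, 10, 0] }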

@@ -5216,9 +5216,9 @@ dependencies = [
[[package]]
name = "stringdex"
version = "0.0.1-alpha10"
version = "0.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa846a7d509d1828a4f90962dc09810e161abcada7fc6a921e92c168d0811d7"
checksum = "18b3bd4f10d15ef859c40291769f0d85209de6b0f1c30713ff9cdf45ac43ea36"
dependencies = [
"stacker",
]

@@ -21,7 +21,7 @@ rustdoc-json-types = { path = "../rustdoc-json-types" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
smallvec = "1.8.1"
stringdex = { version = "0.0.1-alpha10" }
stringdex = "=0.0.2"
tempfile = "3"
threadpool = "1.8.1"
tracing = "0.1"

@@ -1447,7 +1447,7 @@ function loadDatabase(hooks) {
makeSearchTreeBranchesAlphaBitmapClass(LONG_ALPHABITMAP_CHARS, 4);
/**
* @typedef {PrefixSearchTree|SuffixSearchTree} SearchTree
* @typedef {PrefixSearchTree|SuffixSearchTree|InlineNeighborsTree} SearchTree
* @typedef {PrefixTrie|SuffixTrie} Trie
*/
@@ -1675,9 +1675,12 @@ function loadDatabase(hooks) {
yield leaves;
}
}
/** @type {HashTable<[number, SearchTree][]>} */
/** @type {HashTable<[number, PrefixSearchTree|SuffixSearchTree][]>} */
const subnodes = new HashTable();
for await (const node of current_layer) {
for await (const nodeEncoded of current_layer) {
const node = nodeEncoded instanceof InlineNeighborsTree ?
nodeEncoded.decode() :
nodeEncoded;
const branches = node.branches;
const l = branches.subtrees.length;
for (let i = 0; i < l; ++i) {
@@ -1741,7 +1744,10 @@ function loadDatabase(hooks) {
// we then yield the smallest ones (can't yield bigger ones
// if we want to do them in order)
for (const {node, len} of current_layer) {
const tree = await node;
const treeEncoded = await node;
const tree = treeEncoded instanceof InlineNeighborsTree ?
treeEncoded.decode() :
treeEncoded;
if (!(tree instanceof PrefixSearchTree)) {
continue;
}
@@ -1804,7 +1810,10 @@ function loadDatabase(hooks) {
/** @type {HashTable<{byte: number, tree: PrefixSearchTree, len: number}[]>} */
const subnodes = new HashTable();
for await (const {node, len} of current_layer) {
const tree = await node;
const treeEncoded = await node;
const tree = treeEncoded instanceof InlineNeighborsTree ?
treeEncoded.decode() :
treeEncoded;
if (!(tree instanceof PrefixSearchTree)) {
continue;
}
@@ -2166,9 +2175,12 @@ function loadDatabase(hooks) {
yield leaves;
}
}
/** @type {HashTable<[number, SearchTree][]>} */
/** @type {HashTable<[number, PrefixSearchTree|SuffixSearchTree][]>} */
const subnodes = new HashTable();
for await (const node of current_layer) {
for await (const nodeEncoded of current_layer) {
const node = nodeEncoded instanceof InlineNeighborsTree ?
nodeEncoded.decode() :
nodeEncoded;
const branches = node.branches;
const l = branches.subtrees.length;
for (let i = 0; i < l; ++i) {
@@ -2264,6 +2276,174 @@ function loadDatabase(hooks) {
}
}
/**
* Represents a subtree where all transitive leaves
* have a shared 16bit prefix and there are no sub-branches.
*/
class InlineNeighborsTree {
/**
* @param {Uint8Array} encoded
* @param {number} start
*/
constructor(
encoded,
start,
) {
this.encoded = encoded;
this.start = start;
}
/**
* @return {PrefixSearchTree|SuffixSearchTree}
*/
decode() {
let i = this.start;
const encoded = this.encoded;
const has_branches = (encoded[i] & 0x04) !== 0;
/** @type {boolean} */
const is_suffixes_only = (encoded[i] & 0x01) !== 0;
let leaves_count = ((encoded[i] >> 4) & 0x0f) + 1;
i += 1;
let branch_count = 0;
if (has_branches) {
branch_count = encoded[i] + 1;
i += 1;
}
const dlen = encoded[i] & 0x3f;
if ((encoded[i] & 0x80) !== 0) {
leaves_count = 0;
}
i += 1;
let data = EMPTY_UINT8;
if (!is_suffixes_only && dlen !== 0) {
data = encoded.subarray(i, i + dlen);
i += dlen;
}
const leaf_value_upper = encoded[i] | (encoded[i + 1] << 8);
i += 2;
/** @type {Promise<SearchTree>[]} */
const branch_nodes = [];
for (let j = 0; j < branch_count; j += 1) {
const branch_dlen = encoded[i] & 0x0f;
const branch_leaves_count = ((encoded[i] >> 4) & 0x0f) + 1;
i += 1;
let branch_data = EMPTY_UINT8;
if (!is_suffixes_only && branch_dlen !== 0) {
branch_data = encoded.subarray(i, i + branch_dlen);
i += branch_dlen;
}
const branch_leaves = new RoaringBitmap(null);
branch_leaves.keysAndCardinalities = Uint8Array.of(
leaf_value_upper & 0xff,
(leaf_value_upper >> 8) & 0xff,
(branch_leaves_count - 1) & 0xff,
((branch_leaves_count - 1) >> 8) & 0xff,
);
branch_leaves.containers = [
new RoaringBitmapArray(
branch_leaves_count,
encoded.subarray(i, i + (branch_leaves_count * 2)),
),
];
i += branch_leaves_count * 2;
branch_nodes.push(Promise.resolve(
is_suffixes_only ?
new SuffixSearchTree(
EMPTY_SEARCH_TREE_BRANCHES,
branch_dlen,
branch_leaves,
) :
new PrefixSearchTree(
EMPTY_SEARCH_TREE_BRANCHES,
EMPTY_SEARCH_TREE_BRANCHES,
branch_data,
branch_leaves,
EMPTY_BITMAP,
),
));
}
/** @type {SearchTreeBranchesArray<SearchTree>} */
const branches = branch_count === 0 ?
EMPTY_SEARCH_TREE_BRANCHES :
new SearchTreeBranchesArray(
encoded.subarray(i, i + branch_count),
EMPTY_UINT8,
);
i += branch_count;
branches.subtrees = branch_nodes;
let leaves = EMPTY_BITMAP;
if (leaves_count !== 0) {
leaves = new RoaringBitmap(null);
leaves.keysAndCardinalities = Uint8Array.of(
leaf_value_upper & 0xff,
(leaf_value_upper >> 8) & 0xff,
(leaves_count - 1) & 0xff,
((leaves_count - 1) >> 8) & 0xff,
);
leaves.containers = [
new RoaringBitmapArray(
leaves_count,
encoded.subarray(i, i + (leaves_count * 2)),
),
];
i += leaves_count * 2;
}
return is_suffixes_only ?
new SuffixSearchTree(
branches,
dlen,
leaves,
) :
new PrefixSearchTree(
branches,
branches,
data,
leaves,
EMPTY_BITMAP,
);
}
/**
* Returns the Trie for the root node.
*
* A Trie pointer refers to a single node in a logical decompressed search tree
* (the real search tree is compressed).
*
* @param {DataColumn} dataColumn
* @param {Uint8ArraySearchPattern} searchPattern
* @return {Trie}
*/
trie(dataColumn, searchPattern) {
const tree = this.decode();
return tree instanceof SuffixSearchTree ?
new SuffixTrie(tree, 0, dataColumn, searchPattern) :
new PrefixTrie(tree, 0, dataColumn, searchPattern);
}
/**
* Return the trie representing `name`
* @param {Uint8Array|string} name
* @param {DataColumn} dataColumn
* @returns {Promise<Trie?>}
*/
search(name, dataColumn) {
return this.decode().search(name, dataColumn);
}
/**
* @param {Uint8Array|string} name
* @param {DataColumn} dataColumn
* @returns {AsyncGenerator<Trie>}
*/
searchLev(name, dataColumn) {
return this.decode().searchLev(name, dataColumn);
}
/** @returns {RoaringBitmap} */
getCurrentLeaves() {
return this.decode().getCurrentLeaves();
}
}
class DataColumn {
/**
* Construct the wrapper object for a data column.
@@ -2765,21 +2945,37 @@ function loadDatabase(hooks) {
// because that's the canonical, hashed version of the data
let compression_tag = input[i];
const is_pure_suffixes_only_node = (compression_tag & 0x01) !== 0;
const is_long_compressed = (compression_tag & 0x04) !== 0;
const is_data_compressed = (compression_tag & 0x08) !== 0;
i += 1;
if (is_long_compressed) {
compression_tag |= input[i] << 8;
i += 1;
}
/** @type {number} */
let dlen;
/** @type {number} */
let no_leaves_flag;
if (compression_tag > 1) {
// compressed node
const is_long_compressed = (compression_tag & 0x04) !== 0;
const is_data_compressed = (compression_tag & 0x08) !== 0;
i += 1;
if (is_long_compressed) {
compression_tag |= input[i] << 8;
i += 1;
compression_tag |= input[i] << 16;
i += 1;
}
let dlen = input[i] & 0x7F;
/** @type {number} */
let inline_neighbors_flag;
if (is_data_compressed && is_pure_suffixes_only_node) {
dlen = 0;
no_leaves_flag = 0x80;
inline_neighbors_flag = 0;
} else {
dlen = input[i] & 0x3F;
no_leaves_flag = input[i] & 0x80;
inline_neighbors_flag = input[i] & 0x40;
i += 1;
}
if (inline_neighbors_flag !== 0) {
// node with packed leaves and common 16bit prefix
const leaves_count = no_leaves_flag !== 0 ?
0 :
((compression_tag >> 4) & 0x0f) + 1;
const branch_count = is_long_compressed ?
((compression_tag >> 8) & 0xff) + 1 :
0;
if (is_data_compressed) {
data = data_history[data_history.length - dlen - 1];
dlen = data.length;
@@ -2791,6 +2987,72 @@ function loadDatabase(hooks) {
new Uint8Array(input.buffer, i + input.byteOffset, dlen);
i += dlen;
}
const branches_start = i;
// leaf_value_upper
i += 2;
// branch_nodes
for (let j = 0; j < branch_count; j += 1) {
const branch_dlen = input[i] & 0x0f;
const branch_leaves_count = ((input[i] >> 4) & 0x0f) + 1;
i += 1;
if (!is_pure_suffixes_only_node) {
i += branch_dlen;
}
i += branch_leaves_count * 2;
}
// branch keys
i += branch_count;
// leaves
i += leaves_count * 2;
if (is_data_compressed) {
const clen = (
1 + // first compression header byte
(is_long_compressed ? 1 : 0) + // branch count
1 + // data length and other flags
dlen + // data
(i - branches_start) // branches and leaves
);
const canonical = new Uint8Array(clen);
let ci = 0;
canonical[ci] = input[start] ^ 0x08;
ci += 1;
if (is_long_compressed) {
canonical[ci] = input[start + ci];
ci += 1;
}
canonical[ci] = dlen | no_leaves_flag | 0x40;
ci += 1;
for (let j = 0; j < dlen; j += 1) {
canonical[ci] = data[j];
ci += 1;
}
for (let j = branches_start; j < i; j += 1) {
canonical[ci] = input[j];
ci += 1;
}
tree = new InlineNeighborsTree(canonical, 0);
siphashOfBytes(canonical, 0, 0, 0, 0, hash);
} else {
tree = new InlineNeighborsTree(input, start);
siphashOfBytes(new Uint8Array(
input.buffer,
start + input.byteOffset,
i - start,
), 0, 0, 0, 0, hash);
}
} else if (compression_tag > 1) {
// compressed node
if (is_pure_suffixes_only_node) {
data = EMPTY_UINT8;
} else if (is_data_compressed) {
data = data_history[data_history.length - dlen - 1];
dlen = data.length;
} else {
data = dlen === 0 ?
EMPTY_UINT8 :
new Uint8Array(input.buffer, i + input.byteOffset, dlen);
i += dlen;
}
const coffset = i;
const {
cpbranches,
@@ -2820,19 +3082,27 @@ function loadDatabase(hooks) {
suffix,
);
const clen = (
3 + // lengths of children and data
// lengths of children and data
(is_data_compressed ? 2 : 3) +
// branches
csnodes.length +
csbranches.length +
// leaves
suffix.consumed_len_bytes
);
if (canonical.length < clen) {
canonical = new Uint8Array(clen);
}
let ci = 0;
canonical[ci] = 1;
ci += 1;
canonical[ci] = dlen | no_leaves_flag;
ci += 1;
if (is_data_compressed) {
canonical[ci] = 0x09;
ci += 1;
} else {
canonical[ci] = 1;
ci += 1;
canonical[ci] = dlen | no_leaves_flag;
ci += 1;
}
canonical[ci] = input[coffset]; // suffix child count
ci += 1;
canonical.set(csnodes, ci);
@@ -2901,13 +3171,8 @@ function loadDatabase(hooks) {
}
siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash);
}
hash[2] &= 0x7f;
} else {
i += 1;
// uncompressed node
const dlen = input[i] & 0x7F;
no_leaves_flag = input[i] & 0x80;
i += 1;
if (dlen === 0 || is_pure_suffixes_only_node) {
data = EMPTY_UINT8;
} else {
@@ -2946,7 +3211,6 @@ function loadDatabase(hooks) {
start + input.byteOffset,
i - start,
), 0, 0, 0, 0, hash);
hash[2] &= 0x7f;
tree = is_pure_suffixes_only_node ?
new SuffixSearchTree(
branches,
@@ -2961,30 +3225,33 @@ function loadDatabase(hooks) {
suffix,
);
}
hash[2] &= 0x7f;
hash_history.push({hash: truncatedHash.slice(), used: false});
if (data.length !== 0) {
data_history.push(data);
}
const tree_branch_nodeids = tree.branches.nodeids;
const tree_branch_subtrees = tree.branches.subtrees;
let j = 0;
let lb = tree.branches.subtrees.length;
while (j < lb) {
// node id with a 1 in its most significant bit is inlined, and, so
// it won't be in the stash
if ((tree_branch_nodeids[j * 6] & 0x80) === 0) {
const subtree = stash.getWithOffsetKey(tree_branch_nodeids, j * 6);
if (subtree !== undefined) {
tree_branch_subtrees[j] = Promise.resolve(subtree);
if (!(tree instanceof InlineNeighborsTree)) {
const tree_branch_nodeids = tree.branches.nodeids;
const tree_branch_subtrees = tree.branches.subtrees;
let j = 0;
const lb = tree.branches.subtrees.length;
while (j < lb) {
// node id with a 1 in its most significant bit is inlined, and, so
// it won't be in the stash
if ((tree_branch_nodeids[j * 6] & 0x80) === 0) {
const subtree = stash.getWithOffsetKey(tree_branch_nodeids, j * 6);
if (subtree !== undefined) {
tree_branch_subtrees[j] = Promise.resolve(subtree);
}
}
j += 1;
}
j += 1;
}
if (tree instanceof PrefixSearchTree) {
const tree_mhp_branch_nodeids = tree.might_have_prefix_branches.nodeids;
const tree_mhp_branch_subtrees = tree.might_have_prefix_branches.subtrees;
j = 0;
lb = tree.might_have_prefix_branches.subtrees.length;
let j = 0;
const lb = tree.might_have_prefix_branches.subtrees.length;
while (j < lb) {
// node id with a 1 in its most significant bit is inlined, and, so
// it won't be in the stash