[BugFix] fix multimodal hasher hash collision risk when ndarray shape or dtype differs (#7185)

numpy tobytes() only serializes raw element bytes without encoding shape
or dtype metadata. This means arrays with identical raw bytes but
different shapes (e.g. (6,4) vs (4,6)) or different dtypes (e.g.
float32 vs uint8 reinterpretation of same memory) produce the same
SHA-256 digest, leading to silent cache collisions in
ProcessorCacheManager / EncoderCacheManager / PrefixCacheManager.

Prepend a "{shape}|{dtype}|" header to the byte payload before hashing
so that shape and dtype participate in the digest.

Added test cases for shape and dtype sensitivity.
This commit is contained in:
3em0
2026-04-08 19:26:02 +08:00
committed by GitHub
parent fbc3aa93de
commit 3749457476
2 changed files with 20 additions and 2 deletions
+15 -1
View File
@@ -26,9 +26,23 @@ class TestHashFeatures(unittest.TestCase):
"""Test hash features with numpy ndarray"""
arr = np.random.randint(low=0, high=255, size=(28, 28), dtype=np.uint8)
arr_hash = MultimodalHasher.hash_features(arr)
target_hash = hashlib.sha256((arr.tobytes())).hexdigest()
header = f"{arr.shape}|{arr.dtype}|".encode()
target_hash = hashlib.sha256(header + arr.tobytes()).hexdigest()
assert arr_hash == target_hash, f"Ndarray hash mismatch: {arr_hash} != {target_hash}"
def test_hash_features_ndarray_shape_sensitivity(self):
    """Check that the feature hash distinguishes arrays that differ only in shape.

    Both inputs are reshapes of the same 24-element float32 buffer, so
    their raw ``tobytes()`` payloads are identical; only the shape differs.
    """
    flat = np.arange(24, dtype=np.float32)
    tall_hash = MultimodalHasher.hash_features(flat.reshape(6, 4))
    wide_hash = MultimodalHasher.hash_features(flat.reshape(4, 6))
    assert tall_hash != wide_hash
def test_hash_features_ndarray_dtype_sensitivity(self):
    """Arrays with same shape but different dtypes must produce different hashes.

    Two pairs are checked:

    * float32 vs float64 -- the item sizes differ, so the raw payloads
      already have different lengths.
    * float32 vs int32 -- same shape, same item size and an all-zero
      payload, so ``tobytes()`` is byte-for-byte identical and only the
      dtype can tell the two arrays apart. This is the pair that guards
      against a hash that ignores dtype.
    """
    shape = (4, 4)
    f32 = np.zeros(shape, dtype=np.float32)
    f64 = np.zeros(shape, dtype=np.float64)
    i32 = np.zeros(shape, dtype=np.int32)

    f32_hash = MultimodalHasher.hash_features(f32)

    # Different item size: payload lengths differ (64 vs 128 bytes).
    assert f32_hash != MultimodalHasher.hash_features(f64)

    # Precondition for the real check: the raw bytes must be identical,
    # otherwise the assertion below would not isolate the dtype.
    assert f32.tobytes() == i32.tobytes()
    assert f32_hash != MultimodalHasher.hash_features(i32), "dtype must participate in the ndarray hash"
def test_hash_features_object(self):
"""Test hash features with unsupported object type"""
obj = {"key": "value"}