From 374945747652a8d32965591c0c01a00c88b7067f Mon Sep 17 00:00:00 2001
From: 3em0 <59153706+3em0@users.noreply.github.com>
Date: Wed, 8 Apr 2026 19:26:02 +0800
Subject: [PATCH] [BugFix] fix multimodal hasher hash collision risk when
 ndarray shape or dtype differs (#7185)

numpy tobytes() only serializes raw element bytes without encoding shape
or dtype metadata. This means arrays with identical raw bytes but
different shapes (e.g. (6,4) vs (4,6)) or different dtypes (e.g. float32
vs uint8 reinterpretation of same memory) produce the same SHA-256
digest, leading to silent cache collisions in ProcessorCacheManager /
EncoderCacheManager / PrefixCacheManager.

Prepend a "{shape}|{dtype}|" header to the byte payload before hashing so
that shape and dtype participate in the digest.

Added test cases for shape and dtype sensitivity. Both tests compare
arrays whose raw bytes are identical, so only the new header can tell
them apart and the tests fail against the previous implementation.
---
 fastdeploy/multimodal/hasher.py |  6 +++++-
 tests/multimodal/test_hasher.py | 16 +++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/multimodal/hasher.py b/fastdeploy/multimodal/hasher.py
index 6d2fc4f9b9..8793107d3d 100644
--- a/fastdeploy/multimodal/hasher.py
+++ b/fastdeploy/multimodal/hasher.py
@@ -25,5 +25,9 @@ class MultimodalHasher:
     @classmethod
     def hash_features(cls, obj: object) -> str:
         if isinstance(obj, np.ndarray):
-            return hashlib.sha256((obj.tobytes())).hexdigest()
+            # Encode shape and dtype into the hash to avoid collisions between
+            # arrays that share the same raw bytes but differ in layout, e.g.
+            # a (6,4) vs (4,6) array, or float32 vs uint8 reinterpretation.
+            header = f"{obj.shape}|{obj.dtype}|".encode()
+            return hashlib.sha256(header + obj.tobytes()).hexdigest()
         return hashlib.sha256((pickle.dumps(obj))).hexdigest()
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
index a89ff2cf13..ea6368449d 100644
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -26,9 +26,23 @@ class TestHashFeatures(unittest.TestCase):
         """Test hash features with numpy ndarray"""
         arr = np.random.randint(low=0, high=255, size=(28, 28), dtype=np.uint8)
         arr_hash = MultimodalHasher.hash_features(arr)
-        target_hash = hashlib.sha256((arr.tobytes())).hexdigest()
+        header = f"{arr.shape}|{arr.dtype}|".encode()
+        target_hash = hashlib.sha256(header + arr.tobytes()).hexdigest()
         assert arr_hash == target_hash, f"Ndarray hash mismatch: {arr_hash} != {target_hash}"
 
+    def test_hash_features_ndarray_shape_sensitivity(self):
+        """Arrays with same bytes but different shapes must produce different hashes"""
+        base = np.arange(24, dtype=np.float32)
+        a = base.reshape(6, 4)
+        b = base.reshape(4, 6)
+        assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
+
+    def test_hash_features_ndarray_dtype_sensitivity(self):
+        """Arrays with same shape and same raw bytes but different dtypes must produce different hashes"""
+        a = np.zeros((4, 4), dtype=np.float32)
+        b = np.zeros((4, 4), dtype=np.int32)
+        assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
+
     def test_hash_features_object(self):
         """Test hash features with unsupported object type"""
         obj = {"key": "value"}