mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[BugFix] fix multimodal hasher hash collision risk when ndarray shape or dtype differs (#7185)
NumPy's ndarray.tobytes() serializes only the raw element bytes; it does
not encode shape or dtype metadata. As a result, arrays with identical raw
bytes but different shapes (e.g. (6,4) vs (4,6)) or different dtypes (e.g.
a float32 array vs a uint8 view of the same memory) produce the same
SHA-256 digest, leading to silent cache collisions in
ProcessorCacheManager / EncoderCacheManager / PrefixCacheManager.
Prepend a "{shape}|{dtype}|" header to the byte payload before hashing
so that shape and dtype participate in the digest.
Add test cases for shape and dtype sensitivity.
This commit is contained in:
@@ -25,5 +25,9 @@ class MultimodalHasher:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def hash_features(cls, obj: object) -> str:
|
def hash_features(cls, obj: object) -> str:
|
||||||
if isinstance(obj, np.ndarray):
|
if isinstance(obj, np.ndarray):
|
||||||
return hashlib.sha256((obj.tobytes())).hexdigest()
|
# Encode shape and dtype into the hash to avoid collisions between
|
||||||
|
# arrays that share the same raw bytes but differ in layout, e.g.
|
||||||
|
# a (6,4) vs (4,6) array, or float32 vs uint8 reinterpretation.
|
||||||
|
header = f"{obj.shape}|{obj.dtype}|".encode()
|
||||||
|
return hashlib.sha256(header + obj.tobytes()).hexdigest()
|
||||||
return hashlib.sha256((pickle.dumps(obj))).hexdigest()
|
return hashlib.sha256((pickle.dumps(obj))).hexdigest()
|
||||||
|
|||||||
@@ -26,9 +26,23 @@ class TestHashFeatures(unittest.TestCase):
|
|||||||
"""Test hash features with numpy ndarray"""
|
"""Test hash features with numpy ndarray"""
|
||||||
arr = np.random.randint(low=0, high=255, size=(28, 28), dtype=np.uint8)
|
arr = np.random.randint(low=0, high=255, size=(28, 28), dtype=np.uint8)
|
||||||
arr_hash = MultimodalHasher.hash_features(arr)
|
arr_hash = MultimodalHasher.hash_features(arr)
|
||||||
target_hash = hashlib.sha256((arr.tobytes())).hexdigest()
|
header = f"{arr.shape}|{arr.dtype}|".encode()
|
||||||
|
target_hash = hashlib.sha256(header + arr.tobytes()).hexdigest()
|
||||||
assert arr_hash == target_hash, f"Ndarray hash mismatch: {arr_hash} != {target_hash}"
|
assert arr_hash == target_hash, f"Ndarray hash mismatch: {arr_hash} != {target_hash}"
|
||||||
|
|
||||||
|
def test_hash_features_ndarray_shape_sensitivity(self):
|
||||||
|
"""Arrays with same bytes but different shapes must produce different hashes"""
|
||||||
|
base = np.arange(24, dtype=np.float32)
|
||||||
|
a = base.reshape(6, 4)
|
||||||
|
b = base.reshape(4, 6)
|
||||||
|
assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
|
||||||
|
|
||||||
|
def test_hash_features_ndarray_dtype_sensitivity(self):
|
||||||
|
"""Arrays with same shape but different dtypes must produce different hashes"""
|
||||||
|
a = np.zeros((4, 4), dtype=np.float32)
|
||||||
|
b = np.zeros((4, 4), dtype=np.float64)
|
||||||
|
assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
|
||||||
|
|
||||||
def test_hash_features_object(self):
|
def test_hash_features_object(self):
|
||||||
"""Test hash features with unsupported object type"""
|
"""Test hash features with unsupported object type"""
|
||||||
obj = {"key": "value"}
|
obj = {"key": "value"}
|
||||||
|
|||||||
Reference in New Issue
Block a user