diff --git a/.jules/bolt.md b/.jules/bolt.md
new file mode 100644
index 0000000000..9af9428c93
--- /dev/null
+++ b/.jules/bolt.md
@@ -0,0 +1,3 @@
+## 2026-04-19 - Unnecessary dtype conversions in hot paths
+**Learning:** In PaddlePaddle, calling `.astype(dtype)` creates a new tensor and dispatches a kernel even when the tensor is already of the target dtype, which can slow down hot paths like RMSNorm.
+**Action:** Add explicit conditional checks (`if tensor.dtype != target_dtype`) before calling `.astype` in frequently executed methods to save memory allocations and kernel dispatch overheads.
diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py
index 4a2cf32d1b..2c51462864 100644
--- a/fastdeploy/model_executor/layers/normalization.py
+++ b/fastdeploy/model_executor/layers/normalization.py
@@ -232,10 +232,12 @@ class RMSNorm(nn.Layer):
                   operations (like linear transformation) on the `residual_input`.
         """
         x_dtype = x.dtype
-        x = x.astype(self.weight.dtype)
+        if x.dtype != self.weight.dtype:
+            x = x.astype(self.weight.dtype)
         if residual_input is not None:
             residual_input_dtype = residual_input.dtype
-            residual_input = residual_input.astype(self.weight.dtype)
+            if residual_input.dtype != self.weight.dtype:
+                residual_input = residual_input.astype(self.weight.dtype)
 
         if residual_input is None:
             residual_out = x
@@ -276,9 +278,13 @@ class RMSNorm(nn.Layer):
                 x = x + residual_input
             norm_out = proxy_rmsnorm(x, self.weight, self.eps), x
 
-        out = norm_out[0].astype(x_dtype)
+        out = norm_out[0]
+        if out.dtype != x_dtype:
+            out = out.astype(x_dtype)
         if residual_input is not None:
-            residual_out = norm_out[1].astype(residual_input_dtype)
+            residual_out = norm_out[1]
+            if residual_out.dtype != residual_input_dtype:
+                residual_out = residual_out.astype(residual_input_dtype)
 
         if self.split_x:
             assert residual_out is not None