# Source patch: commit a0fed22ddbbe7fa41b6feeef184063cc572a436b
# Author: GoldPancake <56388518+Deleter-D@users.noreply.github.com>
# Date: Wed, 24 Dec 2025 15:00:06 +0800
# Subject: [PATCH] [Feature] Add entropy calculation script
# File: scripts/calculate_avg_entropy.py
"""Compute a trimmed average of the entropy values found in a log file.

The script reads ``data_processor.log`` from the directory given by
``--log-dir``, collects the first ``entropy: <number>`` occurrence on each
line, drops a fraction of the smallest and largest values, and prints the
average of what remains.
"""

import argparse
import os
import re
from typing import List, Optional, Tuple

# Matches "entropy:" followed by an unsigned decimal number with an optional
# exponent. Compiled once because it is applied to every line of the log.
_ENTROPY_PATTERN = re.compile(r"entropy:\s*([0-9]+\.?[0-9]*(?:[eE][+-]?[0-9]+)?)")


def extract_entropy_values(log_path: str) -> List[float]:
    """Return the entropy values found in the file at *log_path*.

    Only the first ``entropy: <number>`` occurrence on each line is used.
    Values are returned in the order they appear in the file.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    entropy_values = []
    # errors="replace" keeps undecodable bytes elsewhere in the log from
    # aborting the scan; the pattern itself is plain ASCII.
    with open(log_path, "r", encoding="utf-8", errors="replace") as f:
        # Iterate the file object directly so the whole log is never held in
        # memory at once.
        for line in f:
            match = _ENTROPY_PATTERN.search(line)
            if match:
                # The pattern only captures text that float() accepts, so no
                # ValueError handling is needed here.
                entropy_values.append(float(match.group(1)))
    return entropy_values


def calculate_average(
    entropy_values: List[float], drop_ratio: float = 0.1
) -> Tuple[Optional[float], List[float]]:
    """Return the trimmed mean of *entropy_values* and the values it used.

    The values are sorted and ``int(len(entropy_values) * drop_ratio)`` items
    are removed from each end before averaging. When that count is zero or
    negative, nothing is removed.

    Returns:
        A ``(average, kept_values)`` tuple. ``kept_values`` is sorted in
        ascending order. When there is nothing to average (empty input, or
        the trimming removed every value) the result is ``(None, [])``.
    """
    if not entropy_values:
        # Always return a 2-tuple so callers can unpack unconditionally.
        return None, []

    sorted_vals = sorted(entropy_values)
    n = len(sorted_vals)
    drop_count = int(n * drop_ratio)
    filtered_vals = sorted_vals[drop_count : n - drop_count] if drop_count > 0 else sorted_vals
    if not filtered_vals:
        return None, []

    avg = sum(filtered_vals) / len(filtered_vals)
    return avg, filtered_vals


def main():
    """Parse the command line, read the log, and print the entropy summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--log-dir", type=str, required=True)
    parser.add_argument("--drop-ratio", "-d", type=float, default=0.1)
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    log_path = os.path.join(args.log_dir, "data_processor.log")
    if not os.path.isfile(log_path):
        # Report a missing log as a usage error instead of a raw traceback.
        parser.error(f"log file not found: {log_path}")

    entropy_values = extract_entropy_values(log_path)
    average_entropy, filtered_vals = calculate_average(entropy_values, args.drop_ratio)

    print(f"{len(entropy_values)} entropy values were found")
    print(f"effective entropy values: {len(filtered_vals)} (drop ratio {args.drop_ratio})")
    if average_entropy is None:
        # No value survived (empty log or an over-large drop ratio); the
        # float format spec below would raise TypeError on None.
        print("Average entropy: N/A (no entropy values left to average)")
    else:
        print(f"Average entropy: {average_entropy:.10f}")
    if args.verbose:
        print("\nentropy details:")
        for i, value in enumerate(filtered_vals, 1):
            print(f"  {i}. {value}")


if __name__ == "__main__":
    main()