Files
PMail/server/hooks/spam_block/trec06c_format.py
T
Jinnrry 054336fe9e v2.6.1 (#169)
1、新增垃圾邮件过滤插件
2、使用github.com/dlclark/regexp2替换go原生的正则包
3、修复空数据导致的邮件插入失败
2024-07-20 10:39:17 +08:00

38 lines
1.1 KiB
Python

import os
from email.parser import Parser
from email.policy import default
# This script reorganizes the trec06c dataset and can generate data in the training-set and test-set formats.
def getData(path):
    """Read one e-mail file and return its subject and single-line body.

    Args:
        path: Path of a raw e-mail file. The file is decoded as gb2312 and
            bytes that cannot be decoded are dropped.

    Returns:
        A ``(subject, body)`` tuple. ``subject`` is the value of the
        message's Subject header (``None`` when the header is absent);
        ``body`` is the payload text with every newline removed.
    """
    # Context manager so the handle is closed even if read() raises; the
    # caller invokes this once per message, so leaked handles add up fast.
    with open(path, 'r', encoding='gb2312', errors='ignore') as f:
        data = f.read()
    message = Parser(policy=default).parsestr(data)
    if message.is_multipart():
        # For multipart mail get_payload() returns a list of sub-messages,
        # which has no str.replace(); collect the text of the leaf parts.
        pieces = []
        for part in message.walk():
            if part.is_multipart():
                continue
            payload = part.get_payload()
            if isinstance(payload, str):
                pieces.append(payload)
        body = "".join(pieces)
    else:
        body = message.get_payload()
    body = body.replace("\n", "")
    return message["subject"], body
# Number of leading index entries written to the training file; every entry
# processed after that goes to the test file.
TRAIN_SIZE = 55000

# Count of index entries written so far.
num = 0
# Each index line is split on spaces into a label and a mail-file path
# (presumably of the form "spam ../data/000/000" -- confirm against the
# dataset's index file).
with open("index", "r") as f, \
        open("trec06c_train.csv", "w") as w, \
        open("trec06c_test.csv", "w") as wt:
    for line in f:
        infos = line.split(" ")
        if len(infos) < 2:
            # Blank or malformed line: there is no path field, so skip it
            # rather than crash with IndexError part-way through the run.
            continue
        subject, body = getData(infos[1].strip())
        # Label is 1 for spam and 0 for anything else.
        tp = 1 if infos[0].lower() == "spam" else 0
        data = "{} \t{} {}\n".format(tp, subject, body)
        if num < TRAIN_SIZE:
            w.write(data)
        else:
            wt.write(data)
        num += 1
# Report how many entries were written in total.
print(num)