mirror of
https://github.com/Jinnrry/PMail.git
synced 2026-04-23 00:17:16 +08:00
054336fe9e
1、新增垃圾邮件过滤插件 2、使用github.com/dlclark/regexp2替换go原生的正则包 3、修复空数据导致的邮件插入失败
38 lines
1.1 KiB
Python
38 lines
1.1 KiB
Python
import os
|
|
from email.parser import Parser
|
|
from email.policy import default
|
|
|
|
# This script organizes the trec06c dataset and generates the training-set and test-set data files.
|
|
|
|
def getData(path):
    """Read one raw email file and return its subject and flattened body.

    The file is decoded as GB2312; bytes that cannot be decoded are dropped
    silently (``errors='ignore'``).

    Args:
        path: Path of the raw email file to parse.

    Returns:
        A ``(subject, body)`` tuple. ``subject`` is the value of the
        ``Subject`` header, or ``None`` when the message has no such header.
        ``body`` is the message payload with every newline removed; for a
        multipart message the payloads of all leaf parts are concatenated
        in document order (no transfer decoding is applied).
    """
    # Context manager so the handle is closed even if read() raises.
    with open(path, 'r', encoding='gb2312', errors='ignore') as f:
        data = f.read()

    message = Parser(policy=default).parsestr(data)

    # get_payload() returns a list of sub-messages for multipart mail, and a
    # list has no .replace(). Walk the tree and keep only leaf payloads; for
    # a non-multipart message walk() yields just the message itself, so the
    # result is the same string get_payload() would have returned.
    chunks = []
    for part in message.walk():
        if part.is_multipart():
            continue
        payload = part.get_payload()
        if isinstance(payload, str):
            chunks.append(payload)
    body = "".join(chunks).replace("\n", "")

    return message["subject"], body
|
|
|
|
|
|
# Number of leading index entries written to the training file; every entry
# after that goes to the test file.
TRAIN_SIZE = 55000

num = 0  # count of index entries processed so far

# Example of a single lookup: getData("../data/000/000")

# Each index line is split on a single space: the first field is the label
# (compared case-insensitively against "spam"), the second is the path of the
# raw email file handed to getData().
# The outputs are written as UTF-8 explicitly so the result does not depend
# on the platform's default locale encoding.
with open("index", "r") as f, \
        open("trec06c_train.csv", "w", encoding="utf-8") as w, \
        open("trec06c_test.csv", "w", encoding="utf-8") as wt:
    for line in f:
        infos = line.split(" ")
        if len(infos) < 2:
            # Blank or malformed line: there is no path field to read, so
            # skip it instead of failing with IndexError.
            continue

        subject, body = getData(infos[1].strip())

        # Label column: 1 for spam, 0 for everything else.
        tp = 1 if infos[0].lower() == "spam" else 0

        # Output row layout: "<label> \t<subject> <body>\n".
        data = "{} \t{} {}\n".format(tp, subject, body)

        if num < TRAIN_SIZE:
            w.write(data)
        else:
            wt.write(data)

        num += 1
        # NOTE(review): the original indentation was lost; this is assumed to
        # be a per-record progress counter inside the loop - confirm.
        print(num)