python,txt标题清理代码

python,txt标题清理代码-源码网
python,txt标题清理代码
此内容为付费阅读,请付费后查看
9.9
立即购买
您当前未登录!建议登录后购买,可保存购买订单
付费阅读
已售 2
import re
from bs4 import BeautifulSoup

def delcd(intxt, outtxt, min_len=6, max_len=30):
    """Copy only the medium-length lines of ``intxt`` into ``outtxt``.

    A line is kept when its text is strictly longer than ``min_len`` and
    strictly shorter than ``max_len`` characters.  The trailing line
    terminator is not counted, so a final line without a newline is
    measured the same way as every other line.

    Args:
        intxt: Path of the UTF-8 text file to read.
        outtxt: Path of the UTF-8 text file to (over)write with the kept
            lines.  May be the same path as ``intxt`` because the input
            is read completely before the output is opened.
        min_len: Exclusive lower bound on the line length (default 6).
        max_len: Exclusive upper bound on the line length (default 30).
    """
    with open(intxt, "r", encoding="utf8") as f:
        lines = f.readlines()

    # Measure the visible text only; readlines() leaves the "\n" attached,
    # which would otherwise shift both bounds by one character.
    kept = [
        line for line in lines
        if min_len < len(line.rstrip("\r\n")) < max_len
    ]

    with open(outtxt, "w", encoding="utf8") as f:
        f.writelines(kept)

def quchong(outtxt, qctxt):
    """Append the lines of ``outtxt`` to ``qctxt``, skipping duplicates.

    A line is a duplicate when the same text is already present in
    ``qctxt`` (from an earlier run) or appeared earlier in ``outtxt``.
    Lines are compared without their trailing newline, so a final line
    that lacks a terminator still matches its earlier copies.  Each
    skipped line is reported on stdout.

    Args:
        outtxt: Path of the UTF-8 text file whose lines are de-duplicated.
        qctxt: Path of the UTF-8 result file.  It is created when missing
            and appended to otherwise; first-seen order is preserved.
    """
    with open(outtxt, "r", encoding="utf8") as src:
        candidates = src.readlines()

    # Load what the destination already holds exactly once instead of
    # re-opening and re-reading it for every candidate line.
    try:
        with open(qctxt, "r", encoding="utf8") as dst:
            existing = dst.readlines()
    except FileNotFoundError:
        existing = []

    seen = {line.rstrip("\n") for line in existing}
    # If the existing file does not end with a newline, the first appended
    # line would otherwise be glued onto its last line.
    needs_separator = bool(existing) and not existing[-1].endswith("\n")

    # Append mode also creates the file when it does not exist yet.
    with open(qctxt, "a", encoding="utf8") as dst:
        for line in candidates:
            key = line.rstrip("\n")
            if key in seen:
                print("已去除重复-->" + line)
                continue
            if needs_separator:
                dst.write("\n")
                needs_separator = False
            dst.write(key + "\n")
            seen.add(key)

def cleartxt(qctxt, qltxt):
    """Normalise the text of ``qctxt`` and save the result to ``qltxt``.

    Three steps are applied to the whole file content, in this order:

    1. Each of the characters 鼠 牛 虎 龙 蛇 马 羊 猴 鸡 狗 猪 is replaced
       with 兔.
    2. HTML markup is removed by parsing the text with BeautifulSoup
       (``html.parser``) and keeping only ``get_text()``.
    3. Every character that is neither a word character (``\\w``) nor
       whitespace (``\\s``) is deleted, which strips punctuation.

    Args:
        qctxt: Path of the UTF-8 text file to clean.
        qltxt: Path of the UTF-8 file that is (over)written with the
            cleaned text.
    """
    with open(qctxt, "r", encoding="utf8") as f:
        text = f.read()

    # One character-class pass replaces the former chain of single-character
    # substitutions; 兔 itself is left out because mapping it to 兔 is a no-op.
    text = re.sub(r"[鼠牛虎龙蛇马羊猴鸡狗猪]", "兔", text)

    # Drop HTML tags and keep only the text nodes.
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # Remove punctuation and other symbols.
    text = re.sub(r"[^\w\s]", "", text)

    with open(qltxt, "w", encoding="utf8") as f:
        f.write(text)

if __name__ == '__main__':
    # Original word list.
    source_path = r"C:\Users\admin\Desktop\0\newwords.txt"
    # Output of the length filter.
    filtered_path = r"C:\Users\admin\Desktop\0\1.txt"
    # Output of the duplicate-line removal.
    deduped_path = r"C:\Users\admin\Desktop\0\2.txt"
    # Output of the HTML / symbol clean-up.
    cleaned_path = r"C:\Users\admin\Desktop\0\3.txt"

    # Run the three stages in order; each stage reads the file written by
    # the stage before it.
    delcd(source_path, filtered_path)
    quchong(filtered_path, deduped_path)
    cleartxt(deduped_path, cleaned_path)
© 版权声明
THE END
喜欢就支持一下吧
点赞13 分享