词频统计并输出。要求如下:
(1)对“红楼梦.txt”中文本进行分词,并对人物名称进行归一化处理,仅归一化以下内容:凤姐、凤姐
儿、凤丫头归一为凤姐;宝玉、二爷、宝二爷归一为宝玉;黛玉、颦儿、林妹妹、黛玉道归一为黛玉;宝
钗、宝丫头归一为宝钗;贾母、老祖宗归一为贾母;袭人、袭人道归一为袭人;贾政、贾政道归一为贾政;
贾琏、琏二爷归一为贾琏。
(2)不统计“停用词.txt”文件中包含词语的词频。
(3)提取出场次数不少于40次的人物名称,将人物名称及其出场次数按照递减排序,保存到result.csv文
件中,出场次数相同的,则按照人物名称的字符顺序排序。示例如下:
宝玉,123
凤姐,101
……(略)
其中,人物名称与出场次数之间采用英文逗号分隔,无空格,每组信息一行。
参考答案
from collections import Counter

import jieba
# Word-frequency statistics for "红楼梦.txt".
#
# The text is tokenized with jieba, selected character aliases are folded into
# one canonical name, stop words are dropped, and every remaining word that
# occurs at least MIN_COUNT times is written to RESULT_FILE as "name,count".

SOURCE_FILE = "红楼梦.txt"
STOPWORD_FILE = "停用词.txt"
RESULT_FILE = "result.csv"

# Minimum number of occurrences for a word to be written to the result file.
MIN_COUNT = 40

# Frequent words that are excluded in addition to the stop-word file.
EXTRA_STOPWORDS = frozenset([
    "一个", "如今", "一面", "众人", "说道", "只见", "不知", "两个", "起来",
    "二人", "今日", "听见", "不敢", "不能", "东西", "只得", "心中", "回来",
    "几个", "原来", "进来", "出去", "一时", "银子", "起身", "答应", "回去",
])

# Alias -> canonical character name. Only these aliases are normalized.
ALIASES = {
    "凤姐儿": "凤姐",
    "凤丫头": "凤姐",
    "二爷": "宝玉",
    "宝二爷": "宝玉",
    "颦儿": "黛玉",
    "林妹妹": "黛玉",
    "黛玉道": "黛玉",
    "宝丫头": "宝钗",
    "老祖宗": "贾母",
    "袭人道": "袭人",
    "贾政道": "贾政",
    "琏二爷": "贾琏",
}


def load_stopwords(path):
    """Return the set of stop words stored in the UTF-8 text file *path*.

    The file is split on whitespace, so each stop word is matched as a whole
    word rather than as a substring of the file content.
    NOTE(review): assumes the stop words are separated by whitespace
    (for example one per line) -- confirm against the actual file.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return set(handle.read().split())


def count_words(tokens, stopwords=()):
    """Count the tokens after filtering and alias normalization.

    Tokens shorter than two characters, tokens in *stopwords* and tokens in
    EXTRA_STOPWORDS are skipped. Filtering is applied to the raw token, before
    it is mapped to its canonical name through ALIASES.

    Returns a Counter mapping each (normalized) word to its frequency.
    """
    excluded = EXTRA_STOPWORDS | set(stopwords)
    counts = Counter()
    for token in tokens:
        if len(token) < 2 or token in excluded:
            continue
        counts[ALIASES.get(token, token)] += 1
    return counts


def rank_frequent(counts, min_count=MIN_COUNT):
    """Return (word, count) pairs with count >= *min_count*, ranked.

    Pairs are ordered by count, highest first; words with the same count are
    ordered by ascending character order of the word.
    """
    frequent = [(word, n) for word, n in counts.items() if n >= min_count]
    # Negating the count gives descending counts while ties stay ascending.
    frequent.sort(key=lambda pair: (-pair[1], pair[0]))
    return frequent


def write_result(rows, path=RESULT_FILE):
    """Write *rows* to *path*, one "word,count" line per pair (no spaces)."""
    # Explicit UTF-8 so the Chinese names do not depend on the locale encoding.
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines("{},{}\n".format(word, n) for word, n in rows)


def main():
    """Run the whole pipeline: read, tokenize, count, rank and save."""
    with open(SOURCE_FILE, "r", encoding="utf-8") as handle:
        text = handle.read()
    stopwords = load_stopwords(STOPWORD_FILE)
    counts = count_words(jieba.lcut(text), stopwords)
    write_result(rank_frequent(counts))


if __name__ == "__main__":
    main()