import jieba
# Segment the Chinese text in ./nlp_test0.txt with jieba and write the
# space-separated tokens to ./nlp_test1.txt encoded as UTF-8.

# Read raw bytes so that decoding is explicit and does not depend on the
# platform's default text encoding (the implicit decode inside f.read() is
# what raised the UnicodeDecodeError).
with open('./nlp_test0.txt', 'rb') as f:
    raw_bytes = f.read()

# Try strict UTF-8 first; if the bytes are not valid UTF-8, fall back to GBK
# and drop any byte sequences GBK cannot decode instead of aborting.
try:
    document_decode = raw_bytes.decode('utf-8')
except UnicodeDecodeError:
    document_decode = raw_bytes.decode('gbk', 'ignore')

# jieba.cut returns a one-shot iterator: joining (or printing) it consumes it,
# so it must be joined exactly once and the joined string reused afterwards.
document_cut = jieba.cut(document_decode)
result = ' '.join(document_cut)

# Write bytes in binary mode so the output encoding is always UTF-8.
# The with-statements close both files; no explicit close() calls are needed.
with open('./nlp_test1.txt', 'wb') as f2:
    f2.write(result.encode('utf-8'))
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-55-40dd9f231eb6> in <module>
4
5 with open('./nlp_test0.txt') as f:
----> 6 document = f.read()
7
8 document_decode = document.decode('GBK')
UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 32: illegal multibyte sequence
你的文本并不是合法的GBK编码,
或者其中包含了GBK无法解码的字节(例如报错中位置32处的0x80)
可以先用'rb'模式读取字节,再用decode('gbk', 'ignore')忽略非法字符(Python 3中也可以写成open(path, encoding='gbk', errors='ignore')),或者换个编码(例如utf-8)试试
这是一个编码错误:文件的实际编码与读取时使用的GBK不一致。可以先把文件转换成UTF-8编码,再按UTF-8读取