Python全栈开发之Python基础-字符编码与转码
详细文章:
http://www.cnblogs.com/yuanchenqi/articles/5956943.html
http://www.diveintopython3.net/strings.html
需知:
1.在python2默认编码是ASCII, python3里默认是utf-8
2.unicode 的编码方式分为 utf-32(占4个字节),utf-16(占2或4个字节),utf-8(占1-4个字节), so utf-8是unicode的一种编码方式
3.在py3中encode,在转码的同时还会把string 变成bytes类型,decode在解码的同时还会把bytes变回string
1、python2
2、python3
编码应用比较多的场景应该是爬虫了,互联网上不少网站用的编码格式很杂,虽然总体趋向都变成utf-8,但如今还是很杂,因此爬网页时就需要你进行各种编码的转换,不过生活正在变美好,期待一个不需要转码的世界。
最后,编码is a piece of fucking shit, nobody likes it.
ps:
python2 的用法
[root@python2 scripts]# cat encode.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#Author: nulige

import sys
print(sys.getdefaultencoding())

s = "你好"
s_to_unicode = s.decode("utf-8")
print(s_to_unicode)
s_to_gbk = s_to_unicode.encode("gbk")
print(s_to_gbk)

gbk_to_utf8 = s_to_gbk.decode("gbk").encode("utf-8")
print(gbk_to_utf8)
执行结果:
[root@python2 scripts]# python encode.py
ascii #系统默认编码
你好
?oí
你好 #gbk转成utf-8
utf-8是unicode的扩展集
[root@python2 scripts]# cat encode.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#Author: nulige

import sys
print(sys.getdefaultencoding())

s = u"你好"
print(s)

s_to_unicode = s.decode("utf-8")
print(s_to_unicode)
s_to_gbk = s_to_unicode.encode("gbk")
print(s_to_gbk)

gbk_to_utf8= s_to_gbk.decode("gbk").encode("utf-8")
print(gbk_to_utf8)
执行结果:
[root@python2 scripts]# python encode.py
ascii
你好 #utf-8是unicode的扩展集,因此这里也是能够显示中文的
Traceback (most recent call last):
  File "encode.py", line 11, in <module>
    s_to_unicode = s.decode("utf-8")
  File "/usr/local/lib/python2.7/encodings/utf_8.py", line 16, in decode
    return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)
[root@python2 scripts]# cat encode.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#Author: nulige

import sys
print(sys.getdefaultencoding())

s = u"你好"
print(s)

s_to_gbk = s.encode("gbk")
print(s_to_gbk)

gbk_to_utf8= s_to_gbk.decode("gbk").encode("utf-8")
print(gbk_to_utf8)
执行结果:
[root@python2 scripts]# python encode.py
ascii
你好
?oí
你好
python3
#!/usr/bin/env python
#Author: nulige

import sys
print(sys.getdefaultencoding())

s = "你哈" #默认是utf-8
s_gbk = s.encode("gbk") #utf-8转成gbk

print(s_gbk)
print(s.encode())
执行结果:
utf-8 #python默认是utf-8
b'\xc4\xe3\xb9\xfe' #utf-8转成gbk
b'\xe4\xbd\xa0\xe5\x93\x88'
#!/usr/bin/env python
#Author: nulige

import sys
print(sys.getdefaultencoding())

s = "你哈"
s_gbk = s.encode("gbk")

print(s_gbk)
print(s.encode())

gbk_to_utf8 = s_gbk.decode("gbk").encode("utf-8") #gbk转成utf-8
print("utf8",gbk_to_utf8)
执行结果:
utf-8
b'\xc4\xe3\xb9\xfe'
b'\xe4\xbd\xa0\xe5\x93\x88'
utf8 b'\xe4\xbd\xa0\xe5\x93\x88'
总结
把PyCharm字符编码调成gbk
#!/usr/bin/env python
# -*-coding:gbk-*-
#Author: nulige

#不同字符编码要先转成unicode
import sys
print(sys.getdefaultencoding())

s = '你好' #默认unicode
print(s.encode("gbk"))
print(s.encode("utf-8"))
print(s.encode("utf-8").decode("utf-8").encode("gb2312"))
print(s.encode("utf-8").decode("utf-8").encode("gb2312").decode("gb2312"))
执行结果:
utf-8
b'\xc4\xe3\xba\xc3'
b'\xe4\xbd\xa0\xe5\xa5\xbd'
b'\xc4\xe3\xba\xc3'
你好
作业:
python2 or python 3
记住:所有字符集的转换,都要经过unicode
1、把gb2312 to utf-8
2、把utf-8 to gbk