熟悉C++或者Java的同学,写Python代码时往往会沿用C++,Java的写法.有些情况下,用Python自己的方法实现一些功能会更方便.
# coding: utf-8
import os

filepath = r"D:\test"

# Keep only the entries that are regular files; sub-directories are skipped
# and their contents are not visited.
files = [
    entry
    for entry in os.listdir(filepath)
    if os.path.isfile(os.path.join(filepath, entry))
]
print(files)
列出文件夹filepath下的全部文件名,一行代码解决.同理,把os.path.isfile换成os.path.isdir就可以列出全部子文件夹.注意,这里不会列出子目录里面的内容.
# coding: utf-8
# Multiplying a one-element list by n builds a list holding n copies of it.
repeated = 3 * [2]
print(repeated)  # [2, 2, 2]
结果是一个列表(list).这个技巧在分类,聚类算法中初始化类编号时最常用.
下面的写法常用来构建一个乱序的numpy数组.
# coding: utf-8
import numpy as np

# A fixed seed makes the shuffled order reproducible from run to run.
random_state = np.random.RandomState(seed=0)
indices = np.arange(0, 100)
random_state.shuffle(indices)  # shuffles the array in place
print(indices)
np.random.RandomState的参数(随机种子)相同时,构造出的乱序数组一定相同;参数不同时,得到的数组通常不同(但并不保证一定不同).
在Python 3.3之前,解释器内部的unicode编码只可能是UCS-2,UCS-4中的某一种:sys.maxunicode为65535时表示该版本内部编码是UCS-2,sys.maxunicode为1114111时表示该版本内部编码是UCS-4.从Python 3.3开始(PEP 393),sys.maxunicode固定为1114111.
import sys

# Largest code point a str can hold: 65535 on narrow (UCS-2) builds,
# 1114111 on wide (UCS-4) builds and on every build since Python 3.3.
print(sys.maxunicode)
# "Unzip" a list of (label, category) pairs into two parallel tuples.
pairs = [(1, 2), (3, 4), (5, 6)]
labels, categories = zip(*pairs)
print(labels)      # (1, 3, 5)
print(categories)  # (2, 4, 6)
zip(*...)可以把一个由二元组组成的列表拆成2个元组.反过来,zip也可以把2个ndarray按行配对,得到由tuple组成的序列.
import numpy as np

a = np.array([[1, 2], [3, 4], [5, 6], [2, 3], [6, 9]])
b = np.array([[1], [2], [3], [4], [5]])

# zip pairs row i of `a` with row i of `b`, yielding one tuple per row.
for c in zip(a, b):
    print(c)
用numpy.in1d()(新版numpy推荐使用numpy.isin())可以构建一个bool类型的数组.通过该数组做掩码,可以只保留数组中的一部分元素,把其余元素去掉.这在分类算法中筛选数据集时非常有用.
# coding: utf-8
import numpy as np

a = [1, 2, 3, 4, 5]
b = [1, 4]

# mask[i] is True where a[i] also occurs in b.  np.isin supersedes np.in1d,
# which is deprecated as of NumPy 2.0.
mask = np.isin(a, b)
names = np.array(["aaa", "bbb", "ccc", "ddd", "fff"])
names = names[mask]  # boolean indexing keeps only the True positions
print(names)  # ['aaa' 'ddd']
# coding: utf-8
import shutil
from urllib.request import urlopen

URL = "http://download.labs.sogou.com/dl/sogoulabdown/categories_2012.txt"

# Both handles are context-managed so the HTTP response and the output file
# are closed even if the transfer fails; copyfileobj streams the body in
# chunks instead of holding the whole download in memory.
with urlopen(URL) as opener, open("test.txt", 'wb') as f:
    shutil.copyfileobj(opener, f)
在处理大文件时经常会遇到这个问题.求一大批文档的tfidf时会产生一个很大但很稀疏的矩阵,而numpy的各种运算的参数又是numpy数组.不能把稀疏矩阵直接转化成numpy数组(内存装不下),解决方法是在预处理的时候把稀疏矩阵存成很多小文件,比如每50行存成一个小文件,在训练的时候每次只读取一个小文件.这就实现了用小内存处理大文件.
import pickle

import numpy as np

fpath = r"D:\seri.dat"
b = np.array([1, 2, 3])

# Serialize the array to a file.  (The original snippet also built a dict
# that was never written anywhere; only the array is stored.)
with open(fpath, 'wb') as f:
    pickle.dump(b, f)

# Load the array back into memory.
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary
# code embedded in the data.
with open(fpath, 'rb') as f:
    obj2 = pickle.load(f)
print(obj2)
建立矩阵函数
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

corpus = np.array(["aaa bbb ccc", "aaa bbb ddd"])

# Step 1: raw term counts -- one row per document, one column per word.
cv1 = CountVectorizer()
cv1output = cv1.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is its replacement.
print(cv1.get_feature_names_out())

# Step 2: re-weight the counts into tf-idf scores (a sparse matrix).
tfidfTrans1 = TfidfTransformer()
print(tfidfTrans1.fit_transform(cv1output))
tfidfTrans1.fit_transform(cv1output)的返回值就是最终的tfidf矩阵.这时候有一个测试集("aaa vvv ccc", "ccc ccc rrr"),注意vvv,rrr都不在训练集中,要忽略.所以要以训练集的单词为基准,建立测试矩阵.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# --- Training set: learn the vocabulary and the IDF weights. ---
corpus = np.array(["aaa bbb ccc", "aaa bbb ddd"])
cv1 = CountVectorizer()
cv1output = cv1.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is its replacement.
print(cv1.get_feature_names_out())
tfidfTrans1 = TfidfTransformer()
print(tfidfTrans1.fit_transform(cv1output))

# --- Test set: reuse the fitted objects instead of fitting new ones. ---
# cv1.transform() counts only words from the training vocabulary, so the
# unseen words "vvv" and "rrr" are ignored.  tfidfTrans1.transform() applies
# the IDF weights learned from the training set; fitting a fresh
# TfidfTransformer on the test data would weight it with test-set statistics,
# making the two matrices incomparable.
corpus1 = np.array(["aaa vvv ccc", "ccc ccc rrr"])
cv2output = cv1.transform(corpus1)
print(tfidfTrans1.transform(cv2output))
# coding: utf-8
import numpy as np


def dense_to_one_hot(input_data, class_num):
    """Convert integer class labels into a one-hot matrix.

    Args:
        input_data: numpy integer array of labels with one label per row,
            i.e. shape (n,) or (n, 1).  Every label must lie in
            [0, class_num).
        class_num: number of classes, i.e. the width of the result.

    Returns:
        A float array of shape (n, class_num); row i is all zeros except for
        a 1 in column input_data[i].

    Raises:
        ValueError: if there is not exactly one label per row, or if a label
            falls outside [0, class_num).
    """
    data_num = input_data.shape[0]
    labels = input_data.ravel()
    if labels.size != data_num:
        raise ValueError(
            "expected one label per row, got shape %s" % (input_data.shape,)
        )

    labels_one_hot = np.zeros((data_num, class_num))
    if data_num == 0:
        return labels_one_hot

    # With flat-offset indexing an out-of-range label lands in a neighbouring
    # row and a negative one wraps around, silently corrupting the matrix,
    # so reject such labels explicitly.
    if labels.min() < 0 or labels.max() >= class_num:
        raise ValueError("labels must be in the range [0, %d)" % class_num)

    # Row i, column labels[i] <- 1.
    labels_one_hot[np.arange(data_num), labels] = 1
    return labels_one_hot


input_data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 3])
class_num = 10
print(dense_to_one_hot(input_data, class_num))
tuple可以表示不可变的集合,例如,一个点的二维坐标就可以表示成:
# A 2-D point stored as a plain tuple.
p = (1, 2)
但是,看到(1, 2),很难看出这个tuple是用来表示一个坐标的.定义一个class又小题大做了,这时,namedtuple就派上了用场:
from collections import namedtuple

# A lightweight, immutable record type whose fields are reachable by name.
field_names = ['x', 'y']
Point = namedtuple('Point', field_names)

p = Point(x=1, y=2)
print(p.x)
print(p.y)
参考资料:廖雪峰的官方网站
*args表示传递不定长的参数.
def fun_var_args(farg, *args):
    """Print the first argument, then each extra positional argument.

    Every positional argument after ``farg`` is collected into the tuple
    ``args``.
    """
    print("arg:", farg)
    for extra in args:
        print("another arg:", extra)


# 1 is bound to farg; "two" and 3 are collected into args.
fun_var_args(1, "two", 3)
**kwargs也表示传递不定长的参数.和*args的区别是**kwargs传的是key, value的结构.
def fun_var_kwargs(farg, **kwargs):
    """Print the first argument, then each extra keyword argument.

    Every keyword argument other than ``farg`` is collected into the dict
    ``kwargs``, keyed by argument name.
    """
    print("arg:", farg)
    for name, value in kwargs.items():
        print("another keyword arg: %s: %s" % (name, value))


# myarg2 and myarg3 become the keys of kwargs.
fun_var_kwargs(farg=1, myarg2="two", myarg3=3)
原生的方法
# coding: utf-8
import sys

# sys.argv[0] is the script name; the two values of interest follow it.
# Exit with a usage message instead of a bare IndexError when they are absent.
if len(sys.argv) < 3:
    sys.exit("usage: %s <first> <second>" % sys.argv[0])

print(sys.argv[1])
print(sys.argv[2])
用tensorflow的方法
# coding:utf-8 import tensorflow as tf flags = tf.app.flags flags.DEFINE_string("zipfilepath", "a", "zip file path") flags.DEFINE_string("unzipfolder", "b", "unzip folder") FLAGS = flags.FLAGS print(FLAGS.zipfilepath) print(FLAGS.unzipfolder)
import zipfile

# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(r"D:\test.zip") as zip_ref:
    zip_ref.extractall(r"D:\unfolder")
# Run setup.py's sdist command, requesting the gztar format.
python setup.py sdist --formats=gztar
# Overwrite Z in place (slice assignment keeps the same object): each element
# greater than 0.5 becomes 0, every other element becomes 1.
# NOTE(review): Z is defined outside this snippet; the mapping is inverted
# relative to the usual "above the threshold -> 1" -- confirm this is intended.
Z[:] = [0 if x > 0.5 else 1 for x in Z]
a = {1: 2, 2: 3, 3: 4}

# Swap keys and values.  If two keys shared a value, the later one would win.
d = {value: key for key, value in a.items()}
print(d)
import itertools

a = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]

# chain.from_iterable walks the inner lists one after another, which
# flattens the list by one level.
flattened = list(itertools.chain.from_iterable(a))
print(flattened)
import numpy as np

# 1024 rows of 64-dimensional embeddings.
matrix = np.random.random((1024, 64))
ids = np.array([0, 5, 17, 33])

# Indexing with an integer array picks the listed rows.
selected = matrix[ids]
print(selected.shape)  # (4, 64)
能够对比tensorflow的tf.nn.embedding_lookup功能
import platform

# python_version() returns a string such as "3.11.4".
running_python3 = platform.python_version().startswith("3")
if running_python3:
    print("a")
import numpy as np

a = np.random.rand(10)  # ten floats drawn from [0, 1)
print(a)

# Round to the nearest whole number first; astype(int) on its own would
# truncate toward zero instead.
b = np.around(a)
as_integers = b.astype(int)
print(as_integers)
data = {1: 2, 13: 1, 5: 9}

# Default ordering of (key, value) pairs: by key.
count_pairs = sorted(data.items())
print(count_pairs)

# Order by value first, breaking ties by key.
count_pairs = sorted(data.items(), key=lambda pair: (pair[1], pair[0]))
print(count_pairs)
import numpy as np


def rand_arr(a, b, *args):
    """Return reproducible uniform samples from [a, b).

    Args:
        a: lower bound (inclusive).
        b: upper bound (exclusive).
        *args: dimensions of the result, e.g. ``rand_arr(0, 1, 2, 3)`` yields
            a 2x3 array; with no dimensions a single float is returned.

    Returns:
        The same values on every call for the same arguments, because a
        private generator seeded with 0 is used.
    """
    # A private RandomState(0) yields exactly the numbers that
    # np.random.seed(0) followed by np.random.rand() would, but without
    # resetting the global NumPy random state behind the caller's back.
    rng = np.random.RandomState(0)
    return rng.rand(*args) * (b - a) + a


a = rand_arr(0, 1, 2, 3)
print(a)
双星(**)表示乘方(幂运算),不只是平方;对numpy数组是逐元素计算,a ** a即每个元素以自身为指数.
import numpy as np

a = np.array([1, 2, 3])

# ** works element by element: 1**1, 2**2, 3**3.
powers = a ** a
print(powers)
输出结果为[ 1 4 27]
import numpy as np

# 50000 draws from {0, 1}, each value with probability 0.5.
a = np.random.choice(2, size=50000, p=[0.5, 0.5])
print(len(a))
print(a[:10])
import numpy as np

x = np.arange(10)
print(x)

# Shift every element three places to the right; the elements pushed off
# the end wrap around to the front.
shifted = np.roll(x, 3)
print(shifted)
import numpy as np

BEFORE = "---------------------------------转换前---------------------------------"
AFTER = "---------------------------------转换后---------------------------------"

# First swap axes 0 and 1 (axis 2 untouched), then swap axes 1 and 2
# (axis 0 untouched).  The array is rebuilt before each demonstration.
for axes in ((1, 0, 2), (0, 2, 1)):
    arr1 = np.arange(12).reshape(2, 2, 3)
    print(BEFORE)
    print(arr1)
    print(AFTER)
    print(arr1.transpose(axes))
用"数组的数组"来理解多维数组.形状为(2, 2, 3)的arr1是一个有2个元素的数组,每一个元素又是长度为2的数组,而长度为2的数组的每一个元素又是一个长度为3的数组.arr1.transpose((1, 0, 2))的意思是交换第1维和第2维,第3维不变.可以这样认为,arr1原始的结构如下:
\[ \left[ \begin{array}{cc} A & B \\ C & D \\ \end{array} \right] \]
其中A=[0, 1, 2],B=[3, 4, 5],C=[6, 7, 8],D=[9, 10, 11],现在要转置第1维和第2维,所以转置后为
\[ \left[ \begin{array}{cc} A & C \\ B & D \\ \end{array} \right] \]
A,B,C,D的内容不变,这就解释了arr1.transpose((1, 0, 2))的值.用类似的思路可以解释arr1.transpose((0, 2, 1))的值.对于arr1.transpose((0, 2, 1))可以认为是第1维不变,第2,3维转置.第1维的每一个元素都是一个2*3的矩阵,转置后变成3*2,这就解释了arr1.transpose((0, 2, 1))的输出.
通常情况下从dict中按key取一个值,如果key不存在会报错(KeyError).可以用defaultdict定义dict,key不存在时不会报错,而是返回默认值(同时把该key写入dict).
from collections import defaultdict

# int() returns 0, so a missing key is created with the value 0 on first
# access instead of raising KeyError.
a = defaultdict(int)
a["3"] = 1

stored = a["3"]
print(stored)    # 1
missing = a["45"]
print(missing)   # 0
# {0:.2f} renders the first argument with two decimals; {1} inserts the
# second argument unchanged.
progress = '{0:.2f} finished. Epoch {1}'.format(1.1234, 2.3354)
print(progress)

g = "{0:.2f}, {1}".format(1.1234, "aa")
用@property可以像访问属性那样调用方法.
# coding: utf-8
class Person(object):
    """A person whose full name is exposed as a read-only property."""

    def __init__(self, first_name, last_name):
        """Store the two name parts."""
        self.first_name = first_name
        self.last_name = last_name

    @property
    def full_name(self):
        """First and last name joined by a single space."""
        name_parts = (self.first_name, self.last_name)
        return "%s %s" % name_parts


person = Person("zhang", "san")
# Thanks to @property no call parentheses are needed; without the decorator
# this would print the bound method object rather than the name.
print(person.full_name)
class Person(object):
    """A person object that can be called like a function."""

    def __init__(self, name, gender):
        """Store the name and gender."""
        self.name = name
        self.gender = gender

    def __call__(self, friend):
        """Print this person's name followed by the given friend's name."""
        for line in ('My name is %s...' % self.name,
                     'My friend is %s...' % friend):
            print(line)


p = Person('Bob', 'male')
# Calling the instance invokes __call__.
p('Tim')
# coding: utf-8
import os

command = 'ps a'

# os.popen returns a file-like pipe connected to the command's standard
# output; the with-block closes it once the output has been read.
with os.popen(command) as p:
    info = p.read()

print(info)
# coding: utf-8
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

import hashlib

s1 = "中华人民共和国"
s2 = "美国"

# s1 is hashed twice on purpose: the same input always gives the same digest,
# while s2 gives a different one.
for text in (s1, s1, s2):
    print(hashlib.md5(text.encode("utf-8")).hexdigest())
# coding: utf-8
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.weichat

# Insert one document into the "docs" collection of the "weichat" database.
document = {
    "class_type": "canvas",
    "content": "春江潮水连海平",
}
db.docs.insert_one(document)
a = "aaa bbb ccc ddd eee aaa bbb aaa aaa"

# str.count returns the number of non-overlapping occurrences.
occurrences = a.count("aaa")
print(occurrences)  # 4
# coding: utf-8
class MyClass:
    """Small class used to demonstrate hasattr() and getattr()."""

    def __init__(self):
        self.name = "xiaohua"

    def process(self):
        """Return the instance's name."""
        return self.name


t = MyClass()

# hasattr reports whether the attribute (data or method) exists.
for attr in ("name", "process"):
    print(hasattr(t, attr))

print(getattr(t, "name"))        # value of the name attribute
print(getattr(t, "process"))     # the bound method object itself
print(getattr(t, "process")())   # trailing parentheses call the method
print(getattr(t, "age", "18"))   # attribute missing -> the default "18"
# coding: utf-8
import platform

# A single human-readable string describing the underlying platform.
description = platform.platform()
print(description)
# coding: utf-8
from __future__ import print_function

import argparse


def build_parser(argv=None):
    """Parse the command line and return the resulting namespace.

    Args:
        argv: optional list of argument strings.  When None (the default),
            argparse reads sys.argv[1:], which is the original behaviour;
            passing a list lets the function be reused and tested without
            touching the process arguments.

    Returns:
        argparse.Namespace with a ``run_type`` string attribute.

    Raises:
        SystemExit: raised by argparse when ``--run_type`` is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_type', type=str, required=True)
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = build_parser()
    # Any value other than "train" falls through to the "test" branch.
    if args.run_type == "train":
        print("train")
    else:
        print("test")
# coding: utf-8
import re

import jieba

# Matches text that ends with a run of CJK ideographs and/or ASCII letters.
WORD_FORMAT = r"[\u4e00-\u9fa5A-Za-z]+$"
content = "咱们都有一个家,名字叫中国08"

pattern = re.compile(WORD_FORMAT)
seg_list = jieba.cut(content)

# Keep only the segments the pattern accepts and join them with spaces.
kept_words = [word for word in seg_list if pattern.search(word)]
doc = " ".join(kept_words)
print(doc)
m = ["a", "b", "c", "d", "e", "f"]

# A step of -1 walks the list backwards, producing a reversed copy.
backwards = m[::-1]
print(backwards)
m = ["a", "b", "c", "d", "e", "f"]

# A negative start counts from the end: the last three elements.
tail = m[-3:]
print(tail)