24 Ways to Speed Up Your Python

---
jupyter:
  jupytext:
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.2'
      jupytext_version: 1.4.1
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

Part 1: Analyze Your Code's Run Time

1. Measure the time of a single run

# plain approach
import time
tic = time.time()
much_job = [x**2 for x in range(1,1000000,3)]
toc = time.time()
print('used {:.5}s'.format(toc-tic))
# shortcut approach (jupyter)
%%time
much_job = [x**2 for x in range(1,1000000,3)]

2. Measure the average time over repeated runs

# plain approach
from timeit import timeit
g = lambda x:x**2+1
def main():
    return(g(2)**120)

#timeit('main()',setup = 'from __main__ import main',number = 10)
timeit('main()',globals = {'main':main},number = 10)
# shortcut approach (jupyter)
%%timeit -n 10 
g = lambda x:x**2+1
def main():
    return(g(2)**120)
main()
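
timeit.repeat runs the whole measurement several times, and the best round is usually the most stable estimate. A minimal sketch of the same measurement:

# a sketch: 5 measurement rounds of 10 calls each, keep the best round
from timeit import repeat
g = lambda x:x**2+1
def main():
    return g(2)**120
times = repeat('main()',globals = {'main':main},repeat = 5,number = 10)
print('best of 5 rounds: {:.6f}s for 10 calls'.format(min(times)))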

3. Profile run time by called function

# plain approach
def relu(x):
    return(x if x>0 else 0)
def main():
    result = [relu(x) for x in range(-100000,100000,1)]   
    return result
import profile   # the standard library's pure-Python profiler; cProfile is a faster drop-in alternative
profile.run('main()')
# shortcut approach (jupyter)
%prun main()

4. Profile run time line by line

# plain approach
!pip install line_profiler
%load_ext line_profiler
def relu(x):
    return(x if x>0 else 0)
def main():
    result = [relu(x) for x in range(-100000,100000)]   
    return result
from line_profiler import LineProfiler
lprofile = LineProfiler(main,relu)
lprofile.run('main()')
lprofile.print_stats()
# shortcut approach (jupyter)
%lprun -f main -f relu main()

Part 2: Speed Up Your Lookups

5. Use a set instead of a list for in lookups

# slow approach
data = [i**2 + 1 for i in range(1000000)]   # a list, not a generator: list(data) below would exhaust a generator and leave set_data empty
list_data = list(data)
set_data = set(data)
%%time
1098987 in list_data
# fast approach
%%time 
1098987 in set_data 

6. Use a dict instead of two lists for matched lookups

# slow approach
list_a = [2*i-1 for  i in range(1000000)]
list_b = [i**2 for i in list_a ]
dict_ab = dict(zip(list_a,list_b))
%%time
print(list_b[list_a.index(876567)])
# fast approach
%%time
print(dict_ab.get(876567,None))
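
Both of these tips rest on the same fact: an in test on a list scans element by element, O(n), while set and dict use hash lookups, O(1) on average. A quick scaling check, as a sketch:

from timeit import timeit
for n in (10000,100000,1000000):
    xs = list(range(n))
    s = set(xs)
    t_list = timeit('n-1 in xs',globals = {'xs':xs,'n':n},number = 100)
    t_set = timeit('n-1 in s',globals = {'s':s,'n':n},number = 100)
    print(n,'list {:.5f}s  set {:.6f}s'.format(t_list,t_set))
# the list time grows with n; the set time stays flat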

Part 3: Speed Up Your Loops

7. Prefer for loops over while loops

# slow approach
%%time
s,i = 0,0
while i<10000:
    i = i + 1
    s = s + i
print(s) 
# fast approach
%%time
s = 0
for i in range(1,10001):
    s = s + i 
print(s)
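
Faster still is pushing the loop down into a C-level builtin; a sketch:

%%time
s = sum(range(1,10001))   # the builtin sum iterates at C speed, with no Python-level loop
print(s)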

8. Avoid repeated computation inside a loop body

# slow approach
a = [i**2+1 for i in range(2000)]
%%time
b = [i/sum(a) for i in a]
# fast approach
%%time
sum_a = sum(a)
b = [i/sum_a for i in a]

Part 4: Speed Up Your Functions

9. Use caching to speed up recursive functions

# slow approach
%%time
def fib(n):
    return(1 if n in (1,2) else fib(n-1)+fib(n-2))
print(fib(30))
# fast approach
%%time
from functools import lru_cache

@lru_cache(100)
def fib(n):
    return(1 if n in (1,2) else fib(n-1)+fib(n-2))
print(fib(30))
fib.cache_info()
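
lru_cache works by memoizing return values in a dict keyed by the arguments; a hand-rolled sketch of the same mechanism:

memo = {}
def fib(n):
    if n not in memo:   # compute each value only once, then reuse it
        memo[n] = 1 if n in (1,2) else fib(n-1) + fib(n-2)
    return memo[n]
print(fib(30))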

10. Replace recursion with a loop

# slow approach
%%time
def fib(n):
    return(1 if n in (1,2) else fib(n-1)+fib(n-2))
print(fib(30))
# fast approach
%%time
def fib(n):
    if n in (1,2):
        return(1)
    a,b = 1,1
    for i in range(2,n):
        a,b = b,a+b
    return(b)
print(fib(30))

11. Use Numba to speed up Python functions

# slow approach
%%time
def my_power(x):
    return(x**2)

def my_power_sum(n):
    s = 0
    for i in range(1,n+1):
        s = s  + my_power(i)
    return(s)

print(my_power_sum(1000000))
# fast approach
%%time
from numba import jit

@jit
def my_power(x):
    return(x**2)
@jit
def my_power_sum(n):
    s = 0
    for i in range(1,n+1):
        s = s  + my_power(i)
    return(s)

print(my_power_sum(1000000))
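
Depending on your Numba version, a bare @jit may fall back to slow object mode when compilation fails; nopython mode via njit is the usual recommendation. A sketch, assuming numba is installed:

from numba import njit

@njit   # like @jit(nopython=True): compile fully or raise, rather than silently slowing down
def my_power_sum(n):
    s = 0
    for i in range(1,n+1):
        s = s + i**2
    return s

print(my_power_sum(1000000))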

Part 5: Speed Up with Standard Library Functions

12. Use collections.Counter to speed up counting

# slow approach
data = [x**2%1989 for x in range(2000000)]
%%time
values_count = {}
for i in data:
    i_cnt = values_count.get(i,0)
    values_count[i] = i_cnt + 1
print(values_count.get(4,0))
# fast approach
%%time
from collections import Counter
values_count = Counter(data)
print(values_count.get(4,0))
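
Counter also answers frequency questions directly; for example, on the same data:

print(values_count.most_common(3))   # the 3 most frequent values and their counts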

13. Use collections.ChainMap to speed up dict merging

# slow approach
dic_a = {i:i+1 for i in range(1,1000000,2)}
dic_b = {i:2*i+1 for i in range(1,1000000,3)}
dic_c = {i:3*i+1 for i in range(1,1000000,5)}
dic_d = {i:4*i+1 for i in range(1,1000000,7)}
%%time
result = dic_a.copy()
result.update(dic_b)
result.update(dic_c)
result.update(dic_d)
print(result.get(9999,0))
# fast approach
%%time
from collections import ChainMap
chain = ChainMap(dic_a,dic_b,dic_c,dic_d)
print(chain.get(9999,0))
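
ChainMap is fast because it copies nothing: a lookup searches the underlying dicts left to right, so the leftmost dict containing a key wins. Note this is the opposite precedence of chained update calls, where the last writer wins. A minimal illustration:

from collections import ChainMap
low = {'x':1}
high = {'x':99,'y':2}
chain = ChainMap(low,high)
print(chain['x'])   # 1: the leftmost map containing the key wins
low['z'] = 3
print(chain['z'])   # 3: later changes to the underlying dicts show through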

Part 6: Speed Up with NumPy Vectorization

14. Use np.array instead of list

# slow approach
%%time
a = range(1,1000000,3)
b = range(1000000,1,-3)
c = [3*a[i]-2*b[i] for i in range(0,len(a))]
# fast approach
%%time
import numpy as np 
array_a = np.arange(1,1000000,3)
array_b = np.arange(1000000,1,-3)
array_c = 3*array_a - 2*array_b

15. Use np.ufunc instead of math functions

# slow approach
%%time
import math
a = range(1,1000000,3)
b = [math.log(x) for x in a]
# fast approach
%%time
import numpy as np 
array_a = np.arange(1,1000000,3)
array_b = np.log(array_a)

16. Use np.where instead of if

# slow approach
import numpy as np 
array_a = np.arange(-100000,1000000)
%%time
# np.vectorize wraps an ordinary function so it can be applied to arrays (a convenience, not a true vectorization)
relu = np.vectorize(lambda x: x if x>0 else 0)
array_b = relu(array_a)
# fast approach
%%time
relu = lambda x:np.where(x>0,x,0)
array_b = relu(array_a)
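
For this particular relu, a single ufunc call is simpler still; a sketch:

%%time
array_b = np.maximum(array_a,0)   # elementwise maximum with 0, same result as the relu above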

Part 7: Speed Up Your Pandas

17. Prefer calling np.ufunc functions directly

# slow approach
import numpy as np 
import pandas as pd 
df = pd.DataFrame(np.random.randint(-10,11,size = (100000,26)),
                  columns = list('abcdefghijklmnopqrstuvwxyz'))

%time dfresult = df.applymap(lambda x:np.sin(x)+np.cos(x))   # applymap calls the Python lambda once per element
# fast approach
%%time
dfresult = np.sin(df) + np.cos(df)

18. Avoid dynamically changing the number of rows in a DataFrame

# slow approach
%%time
import pandas as pd
import numpy as np
df = pd.DataFrame(columns = list('abcdefghijklmnopqrstuvwxyz') )
for i in range(10000):
    df.loc[i,:] = range(i,i+26)
# fast approach
%%time
import pandas as pd
import numpy as np
df = pd.DataFrame(np.zeros((10000,26)),
                  columns = list('abcdefghijklmnopqrstuvwxyz'))
for i in range(10000):
    df.loc[i,:] = range(i,i+26)
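
Faster still is to skip per-row assignment entirely and construct the DataFrame once from a prebuilt array; a sketch:

%%time
import numpy as np
import pandas as pd
data = np.array([list(range(i,i+26)) for i in range(10000)])
df = pd.DataFrame(data,columns = list('abcdefghijklmnopqrstuvwxyz'))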

19. Read and write csv files instead of xlsx files

# test data
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randint(-10,11,size=(10000,5)),
    columns = list('abced'))
# slow approach
%%time
df.to_excel('data.xlsx')
# fast approach
%%time
df.to_csv('data.csv')

20. Use the pandas multiprocess tool pandarallel

# slow approach
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(-10,11,size=(10000,26)),
                 columns = list('abcdefghijklmnopqrstuvwxyz'))
%%time
result = df.apply(np.sum,axis = 1) 
# fast approach
!pip install pandarallel
%%time
from pandarallel import pandarallel 
pandarallel.initialize(nb_workers=4) 
result = df.parallel_apply(np.sum,axis = 1)  

Part 8: Speed Up with Dask

21. Use dask to speed up DataFrames

# slow approach
import numpy as np
import pandas as pd                     

df = pd.DataFrame(np.random.randint(0,6,size=(100000000,5)),
                 columns = list('abcde'))   

%time df.groupby('a').mean()    
# fast approach
!pip install dask 
import dask.dataframe as dd
df_dask = dd.from_pandas(df,npartitions=40)
%time df_dask.groupby('a').mean().compute()

22. Use dask.delayed for multiprocess speedup

# slow approach
import time
def muchjob(x):
    time.sleep(5)
    return(x**2)
%%time
result = [muchjob(i) for i in range(5)]
result
# fast approach
%%time
from dask import delayed,compute
# build the task graph lazily, then run it with the multiprocessing scheduler
# (selecting the scheduler by name makes the old threaded/multiprocessing imports unnecessary)
values = [delayed(muchjob)(i) for i in range(5)]
result = compute(*values,scheduler='multiprocessing')

Part 9: Speed Up with Multithreading and Multiprocessing

23. Use multithreading for IO-bound tasks

# slow approach
%rm -rf *.txt
%%time
def writefile(i):
    with open(str(i)+'.txt','w') as f:
        s = ('hello %d'%i)*10000000
        f.write(s)
        
# serial version
for i in range(30):
    writefile(i)
# fast approach
%%time
import threading

def writefile(i):
    with open(str(i)+'.txt','w') as f:
        s = ('hello %d'%i)*10000000
        f.write(s)

# multithreaded version
thread_list = []
for i in range(30):
    t = threading.Thread(target=writefile,args=(i,))
    t.daemon = True   # mark as a daemon thread (setDaemon is deprecated)
    thread_list.append(t)

for t in thread_list:
    t.start()   # start the thread

for t in thread_list:
    t.join()   # wait for the worker threads to finish
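
The same pattern is shorter with the standard library's thread pool; a sketch reusing the writefile above:

%%time
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=30) as pool:
    pool.map(writefile,range(30))   # exiting the with block waits for all writes to finish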

24. Use multiprocessing for CPU-bound tasks

# slow approach
%%time
import time

def muchjob(x):
    time.sleep(5)
    return(x**2)

# serial version
ans = [muchjob(i) for i in range(8)]
print(ans)
# fast approach
%%time
import time
import multiprocessing

def muchjob(x):
    time.sleep(5)
    return(x**2)

# multiprocess version
pool = multiprocessing.Pool(processes=4)
result = []
for i in range(8):
    result.append(pool.apply_async(muchjob, (i,)))
pool.close()
pool.join()
ans = [res.get() for res in result]
print(ans)
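
The same job written as a standalone script with concurrent.futures; outside a notebook, a process pool needs the main guard, and the worker function must be importable by the child processes:

import time
from concurrent.futures import ProcessPoolExecutor

def muchjob(x):
    time.sleep(5)
    return x**2

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as pool:
        ans = list(pool.map(muchjob,range(8)))
    print(ans)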