View difference between Paste ID: MY85zL6U and xDfZXKTq
SHOW: | | - or go back to the newest paste.
1
## file foo.f90 ##
2
function pixsolve(A, b) result(x)
3
    implicit none
4
    real*8    :: A(:,:,:), b(:,:), x(size(b,1),size(b,2))
5
    integer*4 :: i, n, m, piv(size(b,2)), err
6
    n = size(A,1)
7
    m = size(A,2)
8
    x = b
9
    do i = 1, n
10
        call dgesv(m, 1, A(i,:,:), m, piv, x(i,:), m, err)
11
    end do
12
end function
13
14-
! f2py -c -m foo{,.f90} -llapack -lblas
14+
## file foo_faster.f90 ##
15-
! python
15+
16-
! >>> import numpy as np
16+
17-
! >>> from foo import pixsolve
17+
18-
! >>> n = 6
18+
    integer*4 :: i, n, m, piv(size(b,1)), err
19-
! >>> A = np.random.rand(n, 3, 3)
19+
    n = size(A,3)
20-
! >>> b = np.random.rand(n, 3)
20+
    m = size(A,1)
21-
! >>> x = pixsolve(A, b)  # slightly faster (5%) if A, b are in Fortran order
21+
22-
! >>> max(np.linalg.norm(np.dot(A[i], x[i]) - b[i]) for i in xrange(n))
22+
23-
! 2.603703785810335e-16
23+
        call dgesv(m, 1, A(:,:, i), m, piv, x(:, i), m, err)
24
    end do
25
end function
26
27
## Run f2py ##
28
f2py -c -m foo{,.f90} -llapack -lblas
29
f2py -c -m foo_faster{,.f90} -llapack -lblas
30
31
## file test.ipy (to be ran in IPython) ##
32
import numpy as np
33
from foo import pixsolve
34
from foo_faster import pixsolve as pixsolve_faster
35
36
n = 600
37
A = np.random.rand(n, 3, 3)
38
b = np.random.rand(n, 3)
39
A0 = A.copy()  # Store a copy to check solutions, dgesv modifies the input matrix
40
41
x1 = pixsolve(A, b)  # slightly faster (5%) if A, b were in Fortran order
42
print max(np.linalg.norm(np.dot(A0[i], x1[i]) - b[i]) for i in xrange(n))
43
44
A = A0.copy()
45
x2 = pixsolve_faster(A.T, b.T).T
46
# But you're solving for the transpose of each of your matrices!!
47
print max(np.linalg.norm(np.dot(A0[i].T, x2[i]) - b[i]) for i in xrange(n))
48
49
A = A0.copy()
50
x3 = pixsolve_faster(A.transpose((1, 2, 0)), b.T).T
51
print max(np.linalg.norm(np.dot(A0[i], x3[i]) - b[i]) for i in xrange(n))
52
53
54
%timeit pixsolve(A, b)
55
%timeit pixsolve_faster(A.T, b.T).T
56
%timeit pixsolve_faster(A.transpose((1, 2, 0)), b.T).T
57
58
## Results ## 
59
3.47321633195e-14
60
4.0549756099e-14
61
3.47321633195e-14
62
1000 loops, best of 3: 830 us per loop
63
1000 loops, best of 3: 655 us per loop
64
1000 loops, best of 3: 688 us per loop