diff --git a/examples/cython/fib/fib-exercise/fib.pyx b/examples/cython/fib/fib-exercise/fib.pyx index 8aea37b..b423d23 100644 --- a/examples/cython/fib/fib-exercise/fib.pyx +++ b/examples/cython/fib/fib-exercise/fib.pyx @@ -7,8 +7,8 @@ function (named so we can distinguish it from the sped-up versions of `fib()`). For baseline timings, lets time its performance in the IPython interpreter: - $ ipython - [...] + $ ipython --no-banner + In [1]: from pyfib import pyfib In [2]: %timeit pyfib(10) @@ -34,8 +34,8 @@ directory. 2. You can load this extension module in an interactive interpreter (here, IPython), like so: - $ ipython - [...] + $ ipython --no-banner + In [1]: from fib import fib In [2]: fib(10) diff --git a/examples/cython/fib/fib-exercise/fib_solution.pyx b/examples/cython/fib/fib-exercise/fib_solution.pyx index b6d7679..dc12597 100644 --- a/examples/cython/fib/fib-exercise/fib_solution.pyx +++ b/examples/cython/fib/fib-exercise/fib_solution.pyx @@ -7,8 +7,8 @@ function (named so we can distinguish it from the sped-up versions of `fib()`). For baseline timings, lets time its performance in the IPython interpreter: - $ ipython - [...] + $ ipython --no-banner + In [1]: from pyfib import pyfib In [2]: %timeit pyfib(10) @@ -34,8 +34,8 @@ directory. 2. You can load this extension module in an interactive interpreter (here, IPython), like so: - $ ipython - [...] + $ ipython --no-banner + In [1]: from fib import fib In [2]: fib(10) diff --git a/examples/cython/fib/fib-exercise/pyfib.py b/examples/cython/fib/fib-exercise/pyfib.py index 5dbd284..759f35c 100644 --- a/examples/cython/fib/fib-exercise/pyfib.py +++ b/examples/cython/fib/fib-exercise/pyfib.py @@ -7,8 +7,8 @@ For baseline timings, lets time its performance in the IPython interpreter: - $ ipython - [...] + $ ipython --no-banner + In [1]: from pyfib import pyfib In [2]: %timeit pyfib(10) @@ -34,8 +34,8 @@ 2. You can load this extension module in an interactive interpreter (here, IPython), like so: - $ ipython - [...] + $ ipython --no-banner + In [1]: from fib import fib In [2]: fib(10) diff --git a/examples/cython/fib/fib-exercise/setup_fib.py b/examples/cython/fib/fib-exercise/setup_fib.py index f79e247..497d0ec 100644 --- a/examples/cython/fib/fib-exercise/setup_fib.py +++ b/examples/cython/fib/fib-exercise/setup_fib.py @@ -7,8 +7,8 @@ For baseline timings, lets time its performance in the IPython interpreter: - $ ipython - [...] + $ ipython --no-banner + In [1]: from pyfib import pyfib In [2]: %timeit pyfib(10) @@ -34,8 +34,8 @@ 2. You can load this extension module in an interactive interpreter (here, IPython), like so: - $ ipython - [...] + $ ipython --no-banner + In [1]: from fib import fib In [2]: fib(10) diff --git a/examples/cython/fib/fib-exercise/setup_fib_solution.py b/examples/cython/fib/fib-exercise/setup_fib_solution.py index 785ab55..90668b4 100644 --- a/examples/cython/fib/fib-exercise/setup_fib_solution.py +++ b/examples/cython/fib/fib-exercise/setup_fib_solution.py @@ -7,8 +7,8 @@ For baseline timings, lets time its performance in the IPython interpreter: - $ ipython - [...] + $ ipython --no-banner + In [1]: from pyfib import pyfib In [2]: %timeit pyfib(10) @@ -34,8 +34,8 @@ 2. You can load this extension module in an interactive interpreter (here, IPython), like so: - $ ipython - [...] + $ ipython --no-banner + In [1]: from fib import fib In [2]: fib(10) diff --git a/pdf/02_Speeding_Python.pdf b/pdf/02_Speeding_Python.pdf deleted file mode 100644 index 752c8e4..0000000 Binary files a/pdf/02_Speeding_Python.pdf and /dev/null differ diff --git a/pdf/02_Speeding_Python.tex b/pdf/02_Speeding_Python.tex deleted file mode 100644 index 581089b..0000000 --- a/pdf/02_Speeding_Python.tex +++ /dev/null @@ -1,684 +0,0 @@ -%% This file was auto-generated by IPython, do NOT edit -%% Conversion from the original notebook file: -%% 02_Speeding_Python.ipynb -%% -\documentclass[11pt,english]{article} - -%% This is the automatic preamble used by IPython. Note that it does *not* -%% include a documentclass declaration, that is added at runtime to the overall -%% document. - -\usepackage{amsmath} -\usepackage{amssymb} -\usepackage{graphicx} -\usepackage{ucs} -\usepackage[utf8x]{inputenc} - -% needed for markdown enumerations to work -\usepackage{enumerate} - -% Slightly bigger margins than the latex defaults -\usepackage{geometry} -\geometry{verbose,tmargin=3cm,bmargin=3cm,lmargin=2.5cm,rmargin=2.5cm} - -% Define a few colors for use in code, links and cell shading -\usepackage{color} -\definecolor{orange}{cmyk}{0,0.4,0.8,0.2} -\definecolor{darkorange}{rgb}{.71,0.21,0.01} -\definecolor{darkgreen}{rgb}{.12,.54,.11} -\definecolor{myteal}{rgb}{.26, .44, .56} -\definecolor{gray}{gray}{0.45} -\definecolor{lightgray}{gray}{.95} -\definecolor{mediumgray}{gray}{.8} -\definecolor{inputbackground}{rgb}{.95, .95, .85} -\definecolor{outputbackground}{rgb}{.95, .95, .95} -\definecolor{traceback}{rgb}{1, .95, .95} - -% Framed environments for code cells (inputs, outputs, errors, ...). The -% various uses of \unskip (or not) at the end were fine-tuned by hand, so don't -% randomly change them unless you're sure of the effect it will have. -\usepackage{framed} - -% remove extraneous vertical space in boxes -\setlength\fboxsep{0pt} - -% codecell is the whole input+output set of blocks that a Code cell can -% generate. - -% TODO: unfortunately, it seems that using a framed codecell environment breaks -% the ability of the frames inside of it to be broken across pages. This -% causes at least the problem of having lots of empty space at the bottom of -% pages as new frames are moved to the next page, and if a single frame is too -% long to fit on a page, will completely stop latex from compiling the -% document. So unless we figure out a solution to this, we'll have to instead -% leave the codecell env. as empty. I'm keeping the original codecell -% definition here (a thin vertical bar) for reference, in case we find a -% solution to the page break issue. - -%% \newenvironment{codecell}{% -%% \def\FrameCommand{\color{mediumgray} \vrule width 1pt \hspace{5pt}}% -%% \MakeFramed{\vspace{-0.5em}}} -%% {\unskip\endMakeFramed} - -% For now, make this a no-op... -\newenvironment{codecell}{} - - \newenvironment{codeinput}{% - \def\FrameCommand{\colorbox{inputbackground}}% - \MakeFramed{\advance\hsize-\width \FrameRestore}} - {\unskip\endMakeFramed} - -\newenvironment{codeoutput}{% - \def\FrameCommand{\colorbox{outputbackground}}% - \vspace{-1.4em} - \MakeFramed{\advance\hsize-\width \FrameRestore}} - {\unskip\medskip\endMakeFramed} - -\newenvironment{traceback}{% - \def\FrameCommand{\colorbox{traceback}}% - \MakeFramed{\advance\hsize-\width \FrameRestore}} - {\endMakeFramed} - -% Use and configure listings package for nicely formatted code -\usepackage{listingsutf8} -\lstset{ - language=python, - inputencoding=utf8x, - extendedchars=\true, - aboveskip=\smallskipamount, - belowskip=\smallskipamount, - xleftmargin=2mm, - breaklines=true, - basicstyle=\small \ttfamily, - showstringspaces=false, - keywordstyle=\color{blue}\bfseries, - commentstyle=\color{myteal}, - stringstyle=\color{darkgreen}, - identifierstyle=\color{darkorange}, - columns=fullflexible, % tighter character kerning, like verb -} - -% The hyperref package gives us a pdf with properly built -% internal navigation ('pdf bookmarks' for the table of contents, -% internal cross-reference links, web links for URLs, etc.) -\usepackage{hyperref} -\hypersetup{ - breaklinks=true, % so long urls are correctly broken across lines - colorlinks=true, - urlcolor=blue, - linkcolor=darkorange, - citecolor=darkgreen, - } - -% hardcode size of all verbatim environments to be a bit smaller -\makeatletter -\g@addto@macro\@verbatim\small\topsep=0.5em\partopsep=0pt -\makeatother - -% Prevent overflowing lines due to urls and other hard-to-break entities. -\sloppy - -\begin{document} - -\section{Python in HPC} - - -\subsection{Supercomputing 2012} - -Presenters: - - -\noindent {\bf Andy R. Terrel, PhD}\\ -Texas Advanced Computing Center\\ -University of -Texas at Austin\\[2em] - -\noindent {\bf Travis Oliphant, PhD}\\ -Continuum Analytics\\[2em] - -\noindent {\bf Aron Ahmadia, PhD}\\ -Supercomputing Laboratory\\ -King Abdullah University of Science and Technoglogy\\[2em] -\begin{center} - -\href{http://creativecommons.org/licenses/by/3.0/deed.en\_US}{\includegraphics{figures/creative_commons_logo.png}}\\[2em] - -\noindent Python in HPC Tutorial by Terrel, Oliphant, and Ahmadia is licensed -under a Creative Commons Attribution 3.0 Unported License. \\[2em] - -\href{http://www.tacc.utexas.edu}{\includegraphics[scale=0.8]{figures/TACC_logo.png}} \qquad -\href{http://www.continuum.io}{\includegraphics[scale=.3]{figures/continuum.png}} \qquad -\href{http://www.kaust.edu.sa/}{\includegraphics[scale=.3]{figures/kaust.png}} -\end{center} - -\newpage - -\subsection{Updated Tutorial} - -These presentation materials are being continuously updated as we refine -and improve our demonstrats. To get the latest version of this tutorial -you can: - -\begin{enumerate}[1)] -\item - Download a zip or tar ball from the - \href{https://github.com/aterrel/HPCPythonSC2012/tags}{github SC2012 - tag}: - - wget --no-check-certificate - https://github.com/aterrel/HPCPythonSC2012/zipball/SC2012 -\item - Checkout from git - - git clone https://github.com/aterrel/HPCPythonSC2012.git -\item - View the html version on - \href{http://nbviewer.ipython.org/urls/raw.github.com/aterrel/HPCPythonSC2012/master/02_Speeding_Python.ipynb}{nbviewer}. -\item - As a last resort, head to https://github.com/aterrel/HPCPythonSC2012 - for updated instructions (see the README at the bottom of the page). -\end{enumerate} - -\newpage -\subsection{Interacting with the Tutorial Slides} - -This tutorial is an interactive worksheet designed to encourage you to -try out the lessons during the demonstration. If you are looking at the -pdf version, we encourage you to download the updated version (see -previous slide) and try the interactive version. - -To run the interactive version, you need a good Python environment -including: - -\begin{itemize} -\item - IPython version \textgreater{}= 13.0 -\item - Numpy version \textgreater{}= 1.5 -\item - Scipy -\item - Matplotlib -\end{itemize} - -Move to the directory containing the tarball and execute: - -\begin{verbatim} -$ ipython notebook --pylab=inline -\end{verbatim} - -We heartily endorse the -\href{http://www.enthought.com/products/epd\_free.php}{Free Enthought -Python Distribution}. - -\newpage -\subsection{How Slow is Python} - -Let's add one to a million number - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -lst = range(1000000) # A pure Python list -%timeit [i + 1 for i in lst] # A Python list comprehension (iteration happens in C but with PyObjects) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -1 loops, best of 3: 208 ms per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{Why is Python Slow?} - -Dynamic typing requires lots of metadata around variable. - -\begin{itemize} -\item - Python uses heavy frame objects during iteration -\end{itemize} - -\subsubsection{Solution:} - -\begin{itemize} -\item - Make an object that has a single type and continuous storage. -\item - Implement common functionality into that object to iterate in C. -\end{itemize} - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -arr = arange(1000000) # A NumPy list of integers -%timeit arr + 1 # Use operator overloading for nice syntax, now iteration is in C with ints -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -100 loops, best of 3: 6.64 ms per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{What makes NumPy so much faster?} - -\begin{itemize} -\item - Data layout -\item - homogenous: every item takes up the same size block of memory -\item - single data-type objects -\item - powerful array scalar types -\item - universal function (ufuncs) -\item - function that operates on ndarrays in an element-by-element fashion -\item - vectorized wrapper for a function -\item - built-in functions are implemented in compiled C code -\end{itemize} - -\newpage -\subsection{NumPy Data layout} - -\begin{itemize} -\item - homogenous: every item takes up the same size block of memory -\item - single data-type objects -\item - powerful array scalar types -\end{itemize} - -\begin{figure}[htbp] -\centering -\includegraphics{figures/numpy/threefundamental.png} -\caption{three fundamental} -\end{figure} - -\newpage -\subsection{NumPy Universal Functions (ufuncs)} - -\begin{itemize} -\item - function that operates on ndarrays in an element-by-element fashion -\item - vectorized wrapper for a function -\item - built-in functions are implemented in compiled C code -\end{itemize} - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%timeit [sin(i)**2 for i in arr] - -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -1 loops, best of 3: 11.6 s per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%timeit np.sin(arr)**2 -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -10 loops, best of 3: 44.8 ms per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{Other NumPy features to be aware of} - -\begin{itemize} -\item - Reshaping -\end{itemize} - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -arr2 = arr.reshape((10,100000)) -print(arr2) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -[[ 0 1 2 ..., 99997 99998 99999] - [100000 100001 100002 ..., 199997 199998 199999] - [200000 200001 200002 ..., 299997 299998 299999] - ..., - [700000 700001 700002 ..., 799997 799998 799999] - [800000 800001 800002 ..., 899997 899998 899999] - [900000 900001 900002 ..., 999997 999998 999999]] -\end{verbatim} -\end{codeoutput} -\end{codecell} -\begin{itemize} -\item - Memory Views -\end{itemize} - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -arr2.view? -\end{lstlisting} -\end{codeinput} -\end{codecell} -\begin{itemize} -\item - Index Slicing -\end{itemize} - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -x = np.arange(0, 20, 2); y = x**2 -((y[1:] - y[:-1]) / (x[1:] - x[:-1])) # dy/dx -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -array([ 2, 6, 10, 14, 18, 22, 26, 30, 34]) -\end{verbatim} -\end{codeoutput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -((y[2:] - y[:-2])/(x[2:] - x[:-2])) # d^2y/dx^2 via center differencing -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -array([ 4, 8, 12, 16, 20, 24, 28, 32]) -\end{verbatim} -\end{codeoutput} -\end{codecell} -\begin{itemize} -\item - Fancy Indexing -\end{itemize} - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -evens = arr[arr%2 == 0] -print(evens) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -[ 0 2 4 ..., 999994 999996 999998] -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{Compiling to C} - -C is faster, and Python is easier to write. We want both! - -\subsection{Cython} - -\begin{itemize} -\item - a programming language based on Python -\item - uses extra syntax allowing for optional static type declarations -\item - source code gets translated into optimized C/C++ code and compiled as - Python extension modules -\end{itemize} - -\newpage -\subsection{Using Cython in IPython} - -In IPython we can make any cell call out to Cython via the cell magic - -First load the extension - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%load_ext cythonmagic -\end{lstlisting} -\end{codeinput} -\end{codecell} -Now use \texttt{\%\%cython} at the begining of a code cell to call out -to Cython. - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%%cython -def f_cython(int i): - return i**4 + 3*i**2 + 10 -\end{lstlisting} -\end{codeinput} -\end{codecell} -Now use Cython function in code: - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -f(100) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -100030010 -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{How much faster is Cython?} - -The more you are able to provide type information the better the -compile. For example f without type information: - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%%cython -def f_slow(i): - return i**4 + 3*i**2 + 10 -\end{lstlisting} -\end{codeinput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%timeit f_slow(100) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -1000000 loops, best of 3: 248 ns per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%timeit f_cython(100) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -10000000 loops, best of 3: 117 ns per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{Declaring Cython variables for C level} - -If you use a variable or function only at the Cython level you can keep -it in C via \texttt{cdef}: - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%%cython -cdef f(double x): - return x**2-x - -def integrate_f(double a, double b, int N): - cdef int i - cdef double s, dx - s = 0 - dx = (b-a)/N - for i in range(N): - s += f(a+i*dx) - return s * dx -\end{lstlisting} -\end{codeinput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%timeit integrate_f(1.0, 2.0, 1000) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -10000 loops, best of 3: 37.4 us per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -The pure Python version: - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -def f(x): - return x**2-x - -def integrate_f(a, b, N): - s = 0 - dx = (b-a)/N - for i in range(N): - s += f(a+i*dx) - return s * dx -\end{lstlisting} -\end{codeinput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%timeit integrate_f(1.0, 2.0, 1000) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -1000 loops, best of 3: 530 us per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\newpage -\subsection{Using NumPy with Cython} - -You can also use fast accessors to NumPy arrays from Cython: - -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -%%cython -import numpy as np -# "cimport" is used to import special compile-time information -# about the numpy module (this is stored in a file numpy.pxd which is -# currently part of the Cython distribution). -cimport numpy as np -# We now need to fix a datatype for our arrays. I've used the variable -# DTYPE for this, which is assigned to the usual NumPy runtime -# type info object. -DTYPE = np.int -# "ctypedef" assigns a corresponding compile-time type to DTYPE_t. For -# every type in the numpy module there's a corresponding compile-time -# type with a _t-suffix. -ctypedef np.int_t DTYPE_t -# "def" can type its arguments but not have a return type. The type of the -# arguments for a "def" function is checked at run-time when entering the -# function. -# -# The arrays f, g and h is typed as "np.ndarray" instances. The only effect -# this has is to a) insert checks that the function arguments really are -# NumPy arrays, and b) make some attribute access like f.shape[0] much -# more efficient. (In this example this doesn't matter though.) -def naive_convolve(np.ndarray f, np.ndarray g): - if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1: - raise ValueError("Only odd dimensions on filter supported") - assert f.dtype == DTYPE and g.dtype == DTYPE - # The "cdef" keyword is also used within functions to type variables. It - # can only be used at the top indendation level (there are non-trivial - # problems with allowing them in other places, though we'd love to see - # good and thought out proposals for it). - # - # For the indices, the "int" type is used. This corresponds to a C int, - # other C types (like "unsigned int") could have been used instead. - # Purists could use "Py_ssize_t" which is the proper Python type for - # array indices. - cdef int vmax = f.shape[0] - cdef int wmax = f.shape[1] - cdef int smax = g.shape[0] - cdef int tmax = g.shape[1] - cdef int smid = smax // 2 - cdef int tmid = tmax // 2 - cdef int xmax = vmax + 2*smid - cdef int ymax = wmax + 2*tmid - cdef np.ndarray h = np.zeros([xmax, ymax], dtype=DTYPE) - cdef int x, y, s, t, v, w - # It is very important to type ALL your variables. You do not get any - # warnings if not, only much slower code (they are implicitly typed as - # Python objects). - cdef int s_from, s_to, t_from, t_to - # For the value variable, we want to use the same data type as is - # stored in the array, so we use "DTYPE_t" as defined above. - # NB! An important side-effect of this is that if "value" overflows its - # datatype size, it will simply wrap around like in C, rather than raise - # an error like in Python. - cdef DTYPE_t value - for x in range(xmax): - for y in range(ymax): - s_from = max(smid - x, -smid) - s_to = min((xmax - x) - smid, smid + 1) - t_from = max(tmid - y, -tmid) - t_to = min((ymax - y) - tmid, tmid + 1) - value = 0 - for s in range(s_from, s_to): - for t in range(t_from, t_to): - v = x - smid + s - w = y - tmid + t - value += g[smid - s, tmid - t] * f[v, w] - h[x, y] = value - return h -\end{lstlisting} -\end{codeinput} -\end{codecell} -\begin{codecell} -\begin{codeinput} -\begin{lstlisting} -N=100 -f = np.arange(N*N, dtype=np.int).reshape((N,N)) -g = np.arange(81, dtype=np.int).reshape((9, 9)) -%timeit -n2 -r3 naive_convolve(f, g) -\end{lstlisting} -\end{codeinput} -\begin{codeoutput} -\begin{verbatim} -2 loops, best of 3: 1.52 s per loop -\end{verbatim} -\end{codeoutput} -\end{codecell} -\end{document} diff --git a/pdf/python-speed-cython-sc14.pdf b/pdf/python-speed-cython-sc14.pdf new file mode 100644 index 0000000..b851a47 Binary files /dev/null and b/pdf/python-speed-cython-sc14.pdf differ