diff --git a/.gitignore b/.gitignore index bb14e0e..6a5bebb 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.ipynb_checkpoints/ .DS_Store *.DS_Store *.pyc diff --git a/.ipynb_checkpoints/not_so_obvious_python_stuff-checkpoint.ipynb b/.ipynb_checkpoints/not_so_obvious_python_stuff-checkpoint.ipynb deleted file mode 100644 index 3ea736a..0000000 --- a/.ipynb_checkpoints/not_so_obvious_python_stuff-checkpoint.ipynb +++ /dev/null @@ -1,3160 +0,0 @@ -{ - "metadata": { - "name": "", - "signature": "sha256:9a07a78204a51f0faab65e52657f0446cd604ed470627f9c6af1ba74c047fe23" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sebastian Raschka \n", - "last updated: 04/27/2014 ([Changelog](#changelog))\n", - "\n", - "[Link to this IPython Notebook on GitHub](https://github.com/rasbt/python_reference/blob/master/not_so_obvious_python_stuff.ipynb)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### All code was executed in Python 3.4" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A collection of not-so-obvious Python stuff you should know!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "A = np.array([ [1,2,3], [4,5,6], [7,8,9] ])\n", - ">>> A\n", - "array([[1, 2, 3],\n", - " [4, 5, 6],\n", - " [7, 8, 9]])\n", - "\n", - "I want my result to be:\n", - "
\n", - "array([[1],\n", - " [4],\n", - " [7]])\n", - "\n", - "with `.shape` = `(3,1)`\n", - "\n", - "\n", - "However, the default behavior of numpy is to return the column as a row vector:\n", - "\n", - "
\n", - ">>> A[:,0]\n", - "array([1, 4, 7])\n", - ">>> A[:,0].shape\n", - "(3,)\n", - "" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np\n", - "\n", - "# 1st column, e.g., A[:,0,np.newaxis]\n", - "\n", - "def colvec_method1(A):\n", - " for col in A.T:\n", - " colvec = row[:,np.newaxis]\n", - " yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 83 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 1st column, e.g., A[:,0:1]\n", - "\n", - "def colvec_method2(A):\n", - " for idx in range(A.shape[1]):\n", - " colvec = A[:,idx:idx+1]\n", - " yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 82 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 1st column, e.g., A[:,0].reshape(-1,1)\n", - "\n", - "def colvec_method3(A):\n", - " for idx in range(A.shape[1]):\n", - " colvec = A[:,idx].reshape(-1,1)\n", - " yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 81 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 1st column, e.g., np.vstack(A[:,0]\n", - "\n", - "def colvec_method4(A):\n", - " for idx in range(A.shape[1]):\n", - " colvec = np.vstack(A[:,idx])\n", - " yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 79 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 1st column, e.g., np.row_stack(A[:,0])\n", - "\n", - "def colvec_method5(A):\n", - " for idx in range(A.shape[1]):\n", - " colvec = np.row_stack(A[:,idx])\n", - " yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 77 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 1st column, e.g., np.column_stack((A[:,0],))\n", - "\n", - "def colvec_method6(A):\n", - " for idx in range(A.shape[1]):\n", - " colvec = np.column_stack((A[:,idx],))\n", - " 
yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 74 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 1st column, e.g., A[:,[0]]\n", - "\n", - "def colvec_method7(A):\n", - " for idx in range(A.shape[1]):\n", - " colvec = A[:,[idx]]\n", - " yield colvec" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 89 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def test_method(method, A):\n", - " for i in method(A): \n", - " assert i.shape == (A.shape[0],1), \"{}, {}\".format(i.shape, A.shape[0],1)" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 69 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import timeit\n", - "\n", - "A = np.random.random((300, 3))\n", - "\n", - "for method in [\n", - " colvec_method1, colvec_method2, \n", - " colvec_method3, colvec_method4, \n", - " colvec_method5, colvec_method6,\n", - " colvec_method7]:\n", - " print('\\nTest:', method.__name__)\n", - " %timeit test_method(colvec_method2, A)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "Test: colvec_method1\n", - "100000 loops, best of 3: 16.6 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Test: colvec_method2\n", - "10000 loops, best of 3: 16.1 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Test: colvec_method3\n", - "100000 loops, best of 3: 16.2 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Test: colvec_method4\n", - "100000 loops, best of 3: 16.4 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Test: colvec_method5\n", - "100000 loops, best of 3: 16.2 \u00b5s per loop" - ] 
- }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Test: colvec_method6\n", - "100000 loops, best of 3: 16.8 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Test: colvec_method7\n", - "100000 loops, best of 3: 16.3 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n" - ] - } - ], - "prompt_number": 91 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "
-###Links to view the IPython Notebooks
+
-- [Python benchmarks via `timeit`](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/benchmarks/timeit_tests.ipynb?create=1)
-- [Benchmarks of different palindrome functions](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/benchmarks/palindrome_timeit.ipynb?create=1)
-- [A collection of not so obvious Python stuff you should know!](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/not_so_obvious_python_stuff.ipynb?create=1)
-- [Python's scope resolution for variable names and the LEGB rule](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/scope_resolution_legb_rule.ipynb?create=1)
+- [// Python tips and tutorials](#-python-tips-and-tutorials)
+- [// Python and the web](#-python-and-the-web)
+- [// Algorithms](#-algorithms)
+- [// Plotting and Visualization](#-plotting-and-visualization)
+- [// Benchmarks](#-benchmarks)
+- [// Python and "Data Science"](#-python-and-data-science)
+- [// Useful scripts and snippets](#-useful-scripts-and-snippets)
+- [// Other](#-other)
+- [// Links](#-links)
-### Links to Markdown files
-- [A thorough guide to SQLite database operations in Python](./sqlite3_howto/README.md)
-- [Unit testing in Python - Why we want to make it a habit](./tutorials/unit_testing.md)
-- [Installing Scientific Packages for Python3 on MacOS 10.9 Mavericks](./tutorials/installing_scientific_packages.md)
+
+
+
+
+
+Python tips and tutorials [back to top]
+
+- A collection of not so obvious Python stuff you should know! [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/not_so_obvious_python_stuff.ipynb?create=1)]
+
+- Python's scope resolution for variable names and the LEGB rule [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/scope_resolution_legb_rule.ipynb?create=1)]
+
+- Key differences between Python 2.x and Python 3.x [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/key_differences_between_python_2_and_3.ipynb?create=1)]
+
+- A thorough guide to SQLite database operations in Python [[Markdown](./tutorials/sqlite3_howto/README.md)]
+
+- Unit testing in Python - Why we want to make it a habit [[Markdown](./tutorials/unit_testing.md)]
+
+- Installing Scientific Packages for Python3 on MacOS 10.9 Mavericks [[Markdown](./tutorials/installing_scientific_packages.md)]
+
+- Sorting CSV files using the Python csv module [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/sorting_csvs.ipynb)]
+
+- Using Cython with and without IPython magic [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/running_cython.ipynb)]
+
+- Parallel processing via the multiprocessing module [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/multiprocessing_intro.ipynb?create=1)]
+
+- Entry point: Data - using sci-packages to prepare data for Machine Learning tasks and other data analyses [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/python_data_entry_point.ipynb?create=1)]
+
+- Awesome things that you can do in IPython Notebooks (in progress) [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/awesome_things_ipynb.ipynb)]
+
+- A collection of useful regular expressions [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb)]
+
+- Quick guide for dealing with missing numbers in NumPy [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/numpy_nan_quickguide.ipynb)]
+
+- A random collection of useful Python snippets [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/python_patterns/patterns.ipynb)]
+
+- Things in pandas I wish I'd had known earlier [[IPython nb](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb)]
+
+
+
+| \n", + " | a | \n", + "b | \n", + "c | \n", + "d | \n", + "
|---|---|---|---|---|
| 995 | \n", + "995 | \n", + "995 | \n", + "995 | \n", + "995 | \n", + "
| 996 | \n", + "996 | \n", + "996 | \n", + "996 | \n", + "996 | \n", + "
| 997 | \n", + "997 | \n", + "997 | \n", + "997 | \n", + "997 | \n", + "
| 998 | \n", + "998 | \n", + "998 | \n", + "998 | \n", + "998 | \n", + "
| 999 | \n", + "999 | \n", + "999 | \n", + "999 | \n", + "999 | \n", + "
%watermark [-a AUTHOR] [-d] [-e] [-n] [-t] [-z] [-u] [-c CUSTOM_TIME]\n", + " [-v] [-p PACKAGES] [-h] [-m] [-g] [-w]\n", + "\n", + " \n", + "IPython magic function to print date/time stamps \n", + "and various system information.\n", + "\n", + "watermark version 1.2.1\n", + "\n", + "optional arguments:\n", + " -a AUTHOR, --author AUTHOR\n", + " prints author name\n", + " -d, --date prints current date as MM/DD/YYYY\n", + " -e, --eurodate prints current date as DD/MM/YYYY\n", + " -n, --datename prints date with abbrv. day and month names\n", + " -t, --time prints current time\n", + " -z, --timezone appends the local time zone\n", + " -u, --updated appends a string \"Last updated: \"\n", + " -c CUSTOM_TIME, --custom_time CUSTOM_TIME\n", + " prints a valid strftime() string\n", + " -v, --python prints Python and IPython version\n", + " -p PACKAGES, --packages PACKAGES\n", + " prints versions of specified Python modules and\n", + " packages\n", + " -h, --hostname prints the host name\n", + " -m, --machine prints system and machine info\n", + " -g, --githash prints current Git commit hash\n", + " -w, --watermark prints the current version of watermark\n", + "File: ~/.ipython/extensions/watermark.py\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Sebastian Raschka, 03/2014
Code was executed in Python 3.4.0
True and False in the datetime modulePointed out in a nice article "A false midnight" at http://lwn.net/SubscriberLink/590299/bf73fe823974acea/:
-"it often comes as a big surprise for programmers to find (sometimes by way of a hard-to-reproduce bug) that,
unlike any other time value, midnight (i.e. datetime.time(0,0,0)) is False.
A long discussion on the python-ideas mailing list shows that, while surprising,
that behavior is desirable—at least in some quarters."
import datetime
-
-print('"datetime.time(0,0,0)" (Midnight) evaluates to', bool(datetime.time(0,0,0)))
-
-print('"datetime.time(1,0,0)" (1 am) evaluates to', bool(datetime.time(1,0,0)))
-Truemy_true_val = True
-
-
-print('my_true_val == True:', my_true_val == True)
-print('my_true_val is True:', my_true_val is True)
-
-print('my_true_val == None:', my_true_val == None)
-print('my_true_val is None:', my_true_val is None)
-
-print('my_true_val == False:', my_true_val == False)
-print('my_true_val is False:', my_true_val is False)
-
-print(my_true_val
-if my_true_val:
- print('"if my_true_val:" is True')
-else:
- print('"if my_true_val:" is False')
-
-if not my_true_val:
- print('"if not my_true_val:" is True')
-else:
- print('"if not my_true_val:" is False')
-Falsemy_false_val = False
-
-
-print('my_false_val == True:', my_false_val == True)
-print('my_false_val is True:', my_false_val is True)
-
-print('my_false_val == None:', my_false_val == None)
-print('my_false_val is None:', my_false_val is None)
-
-print('my_false_val == False:', my_false_val == False)
-print('my_false_val is False:', my_false_val is False)
-
-
-if my_false_val:
- print('"if my_false_val:" is True')
-else:
- print('"if my_false_val:" is False')
-
-if not my_false_val:
- print('"if not my_false_val:" is True')
-else:
- print('"if not my_false_val:" is False')
-None 'value'my_none_var = None
-
-print('my_none_var == True:', my_none_var == True)
-print('my_none_var is True:', my_none_var is True)
-
-print('my_none_var == None:', my_none_var == None)
-print('my_none_var is None:', my_none_var is None)
-
-print('my_none_var == False:', my_none_var == False)
-print('my_none_var is False:', my_none_var is False)
-
-
-if my_none_var:
- print('"if my_none_var:" is True')
-else:
- print('"if my_none_var:" is False')
-
-if not my_none_var:
- print('"if not my_none_var:" is True')
-else:
- print('"if not my_none_var:" is False')
-my_empty_string = ""
-
-print('my_empty_string == True:', my_empty_string == True)
-print('my_empty_string is True:', my_empty_string is True)
-
-print('my_empty_string == None:', my_empty_string == None)
-print('my_empty_string is None:', my_empty_string is None)
-
-print('my_empty_string == False:', my_empty_string == False)
-print('my_empty_string is False:', my_empty_string is False)
-
-
-if my_empty_string:
- print('"if my_empty_string:" is True')
-else:
- print('"if my_empty_string:" is False')
-
-if not my_empty_string:
- print('"if not my_empty_string:" is True')
-else:
- print('"if not my_empty_string:" is False')
-It is generally not a good idea to use the == to check for empty lists...
my_empty_list = []
-
-
-print('my_empty_list == True:', my_empty_list == True)
-print('my_empty_list is True:', my_empty_list is True)
-
-print('my_empty_list == None:', my_empty_list == None)
-print('my_empty_list is None:', my_empty_list is None)
-
-print('my_empty_list == False:', my_empty_list == False)
-print('my_empty_list is False:', my_empty_list is False)
-
-
-if my_empty_list:
- print('"if my_empty_list:" is True')
-else:
- print('"if my_empty_list:" is False')
-
-if not my_empty_list:
- print('"if not my_empty_list:" is True')
-else:
- print('"if not my_empty_list:" is False')
-
-
-
-my_zero_list = [0]
-
-
-print('my_zero_list == True:', my_zero_list == True)
-print('my_zero_list is True:', my_zero_list is True)
-
-print('my_zero_list == None:', my_zero_list == None)
-print('my_zero_list is None:', my_zero_list is None)
-
-print('my_zero_list == False:', my_zero_list == False)
-print('my_zero_list is False:', my_zero_list is False)
-
-
-if my_zero_list:
- print('"if my_zero_list:" is True')
-else:
- print('"if my_zero_list:" is False')
-
-if not my_zero_list:
- print('"if not my_zero_list:" is True')
-else:
- print('"if not my_zero_list:" is False')
-List comparisons are a handy way to show the difference between == and is.
While == is rather evaluating the equality of the value, is is checking if two objects are equal. The examples below show that we can assign a pointer to the same list object by using =, e.g., list1 = list2.
a) If we want to make a shallow copy of the list values, we have to make a little tweak: list1 = list2[:], or
b) a deepcopy via list1 = copy.deepcopy(list2)
Possibly the best explanation of shallow vs. deep copies I've read so far:
-*** "Shallow copies duplicate as little as possible. A shallow copy of a collection is a copy of the collection structure, not the elements. With a shallow copy, two collections now share the individual elements. Deep copies duplicate everything. A deep copy of a collection is two collections with all of the elements in the original collection duplicated."***
-(via S.Lott on StackOverflow)
-List modification of the original list doesn't affect
shallow copies or deep copies if the list contains literals.
from copy import deepcopy
-
-my_first_list = [1]
-my_second_list = [1]
-print('my_first_list == my_second_list:', my_first_list == my_second_list)
-print('my_first_list is my_second_list:', my_first_list is my_second_list)
-
-my_third_list = my_first_list
-print('my_first_list == my_third_list:', my_first_list == my_third_list)
-print('my_first_list is my_third_list:', my_first_list is my_third_list)
-
-my_shallow_copy = my_first_list[:]
-print('my_first_list == my_shallow_copy:', my_first_list == my_shallow_copy)
-print('my_first_list is my_shallow_copy:', my_first_list is my_shallow_copy)
-
-my_deep_copy = deepcopy(my_first_list)
-print('my_first_list == my_deep_copy:', my_first_list == my_deep_copy)
-print('my_first_list is my_deep_copy:', my_first_list is my_deep_copy)
-
-print('\nmy_third_list:', my_third_list)
-print('my_shallow_copy:', my_shallow_copy)
-print('my_deep_copy:', my_deep_copy)
-
-my_first_list[0] = 2
-print('after setting "my_first_list[0] = 2"')
-print('my_third_list:', my_third_list)
-print('my_shallow_copy:', my_shallow_copy)
-print('my_deep_copy:', my_deep_copy)
-List modification of the original list does affect
shallow copies, but not deep copies if the list contains compound objects.
my_first_list = [[1],[2]]
-my_second_list = [[1],[2]]
-print('my_first_list == my_second_list:', my_first_list == my_second_list)
-print('my_first_list is my_second_list:', my_first_list is my_second_list)
-
-my_third_list = my_first_list
-print('my_first_list == my_third_list:', my_first_list == my_third_list)
-print('my_first_list is my_third_list:', my_first_list is my_third_list)
-
-my_shallow_copy = my_first_list[:]
-print('my_first_list == my_shallow_copy:', my_first_list == my_shallow_copy)
-print('my_first_list is my_shallow_copy:', my_first_list is my_shallow_copy)
-
-my_deep_copy = deepcopy(my_first_list)
-print('my_first_list == my_deep_copy:', my_first_list == my_deep_copy)
-print('my_first_list is my_deep_copy:', my_first_list is my_deep_copy)
-
-print('\nmy_third_list:', my_third_list)
-print('my_shallow_copy:', my_shallow_copy)
-print('my_deep_copy:', my_deep_copy)
-
-my_first_list[0][0] = 2
-print('after setting "my_first_list[0][0] = 2"')
-print('my_third_list:', my_third_list)
-print('my_shallow_copy:', my_shallow_copy)
-print('my_deep_copy:', my_deep_copy)
-a = 1
-b = 1
-print('a is b', bool(a is b))
-True
-
-a = 999
-b = 999
-print('a is b', bool(a is b))
-| feature | \n", + "optional in | \n", + "mandatory in | \n", + "effect | \n", + "
|---|---|---|---|
| nested_scopes | \n", + "2.1.0b1 | \n", + "2.2 | \n", + "PEP 227:\n", + "Statically Nested Scopes | \n", + "
| generators | \n", + "2.2.0a1 | \n", + "2.3 | \n", + "PEP 255:\n", + "Simple Generators | \n", + "
| division | \n", + "2.2.0a2 | \n", + "3.0 | \n", + "PEP 238:\n", + "Changing the Division Operator | \n", + "
| absolute_import | \n", + "2.5.0a1 | \n", + "3.0 | \n", + "PEP 328:\n", + "Imports: Multi-Line and Absolute/Relative | \n", + "
| with_statement | \n", + "2.5.0a1 | \n", + "2.6 | \n", + "PEP 343:\n", + "The “with” Statement | \n", + "
| print_function | \n", + "2.6.0a2 | \n", + "3.0 | \n", + "PEP 3105:\n", + "Make print a function | \n", + "
| unicode_literals | \n", + "2.6.0a2 | \n", + "3.0 | \n", + "PEP 3112:\n", + "Bytes literals in Python 3000 | \n", + "
Python 2.7.6 \n", + "[GCC 4.0.1 (Apple Inc. build 5493)] on darwin\n", + "Type "help", "copyright", "credits" or "license" for more information.\n", + "\n", + ">>> my_input = input('enter a number: ')\n", + "\n", + "enter a number: 123\n", + "\n", + ">>> type(my_input)\n", + "<type 'int'>\n", + "\n", + ">>> my_input = raw_input('enter a number: ')\n", + "\n", + "enter a number: 123\n", + "\n", + ">>> type(my_input)\n", + "<type 'str'>\n", + "
Python 3.4.1 \n", + "[GCC 4.2.1 (Apple Inc. build 5577)] on darwin\n", + "Type "help", "copyright", "credits" or "license" for more information.\n", + "\n", + ">>> my_input = input('enter a number: ')\n", + "\n", + "enter a number: 123\n", + "\n", + ">>> type(my_input)\n", + "<class 'str'>\n", + "
+
+##This is a test
+
+Code blocks must be indented by 4 whitespaces.
+Python-Markdown has an auto-guess function which works
+pretty well:
+
+ print("Hello, World")
+ # some comment
+ for letter in "this is a test":
+ print(letter)
+
+In cases where Python-Markdown has problems figuring out which
+programming language we use, we can also add the language-tag
+explicitly. One way to do this would be:
+
+ :::python
+ print("Hello, World")
+
+or we can highlight certain lines to
+draw the reader's attention:
+
+ :::python hl_lines="1 5"
+ print("highlight me!")
+ # but not me!
+ for letter in "this is a test":
+ print(letter)
+ # I want to be highlighted, too!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <-- converted HTML contents go here
+
+
+
+
+
+
+If we open our [**final.html**](https://github.com/rasbt/python_reference/blob/master/tutorials/markdown_syntax_highlighting/template.html) file in our web browser now, we can see the pretty Python syntax highlighting.
+
+
+
+Code blocks must be indented by 4 whitespaces. +Python-Markdown has a auto-guess function which works +pretty well:
+print("Hello, World") +# some comment +for letter in "this is a test": + print(letter) +
In cases where Python-Markdown has problems figuring out which +programming language we use, we can also add the language-tag +explicitly. One way to do this would be:
+print("Hello, World") +
or we can highlight certain lines to +draw the reader's attention:
+print("highlight me!") +# but not me! +for letter in "this is a test": + print(letter) +# I want to be highlighted, too! +
Code blocks must be indented by 4 whitespaces. +Python-Markdown has a auto-guess function which works +pretty well:
+print("Hello, World") +# some comment +for letter in "this is a test": + print(letter) +
In cases where Python-Markdown has problems figuring out which +programming language we use, we can also add the language-tag +explicitly. One way to do this would be:
+print("Hello, World") +
or we can highlight certain lines to +draw the reader's attention:
+print("highlight me!") +# but not me! +for letter in "this is a test": + print(letter) +# I want to be highlighted, too! +
|
+ Task + |
+
+ MATLAB/Octave + |
+
+ Python + NumPy + |
+
+ R + |
+
+ Julia + |
+
+ Task + |
+
|
+ CREATING + MATRICES + |
+ |||||
|
+ Creating
+ Matrices |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9] |
+
+ Creating
+ Matrices |
+
|
+ Creating + an 1D column vector + |
+
+ M>
+ a = [1; 2; 3] |
+
+ P> + a + = + np.array([1,2,3]).reshape(1,3) +
|
+
+ R>
+ a = matrix(c(1,2,3), nrow=3, byrow=T) |
+
+ J>
+ a=[1; 2; 3] |
+
+ Creating + an 1D column vector + |
+
|
+ Creating
+ an |
+
+ M>
+ b = [1 2 3] |
+
+ P>
+ b = np.array([1,2,3]) #
+ note that numpy doesn't have P> + b.shape +(3,) +
|
+
+ R>
+ b = matrix(c(1,2,3), ncol=3) |
+
+ J>
+ b=[1 2 3] |
+
+ Creating
+ an |
+
|
+ Creating
+ a |
+
+ M>
+ rand(3,2) |
+
+ P>
+ np.random.rand(3,2) |
+
+ R>
+ matrix(runif(3*2), ncol=2) |
+
+ J>
+ rand(3,2) |
+
+ Creating
+ a |
+
|
+ Creating
+ a |
+
+ M>
+ zeros(3,2) |
+
+ P>
+ np.zeros((3,2)) |
+
+ R>
+ mat.or.vec(3, 2) |
+
+ J>
+ zeros(3,2) |
+
+ Creating
+ a |
+
|
+ Creating
+ an |
+
+ M>
+ ones(3,2) |
+
+ P>
+ np.ones((3,2)) |
+
+ R>
+ mat.or.vec(3, 2) + 1 |
+
+ J>
+ ones(3,2) |
+
+ Creating
+ an |
+
|
+ Creating
+ an |
+
+ M>
+ eye(3) |
+
+ P>
+ np.eye(3) |
+
+ R>
+ diag(3) |
+
+ J>
+ eye(3) |
+
+ Creating
+ an |
+
|
+ Creating
+ a |
+
+ M>
+ a = [1 2 3] |
+
+ P>
+ a = np.array([1,2,3]) |
+
+ R>
+ diag(1:3) |
+
+ J>
+ a=[1, 2, 3] |
+
+ Creating
+ a |
+
|
+ ACCESSING + MATRIX ELEMENTS + |
+ |||||
|
+ Getting
+ the dimension |
+
+ M>
+ A = [1 2 3; 4 5 6] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6] ]) |
+
+ R>
+ A = matrix(1:6,nrow=2,byrow=T) R>
+ dim(A) |
+
+ J>
+ A=[1 2 3; 4 5 6] |
+
+ Getting
+ the dimension |
+
|
+ Selecting + rows + |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Selecting + rows + |
+
|
+ Selecting + columns + |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Selecting + columns + |
+
|
+ Extracting
+ rows and columns by criteria |
+
+ M>
+ A = [1 2 3; 4 5 9; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,9], [7,8,9]]) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 9; 7 8 9] |
+
+ Extracting
+ rows and columns by criteria |
+
|
+ Accessing
+ elements |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(c(1,2,3,4,5,9,7,8,9),nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Accessing
+ elements |
+
|
+ MANIPULATING + SHAPE AND DIMENSIONS + |
+ |||||
|
+ Converting |
+
+ M>
+ b = [1 2 3]
|
+
+ P>
+ b = np.array([1, 2, 3]) |
+
+ R>
+ b = matrix(c(1,2,3), ncol=3) |
+
+ J>
+ b=vec([1 2 3]) |
+
+ Converting |
+
|
+ Reshaping
+ Matrices |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([[1,2,3],[4,5,6],[7,8,9]]) P>
+ B = A.reshape(1, total_elements) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9] |
+
+ Reshaping
+ Matrices |
+
|
+ Concatenating + matrices + |
+
+ M>
+ A = [1 2 3; 4 5 6] |
+
+ P>
+ A = np.array([[1, 2, 3], [4, 5, 6]]) |
+
+ R>
+ A = matrix(1:6,nrow=2,byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6]; |
+
+ Concatenating + matrices + |
+
|
+ Stacking |
+
+ M>
+ a = [1 2 3] |
+
+ P>
+ a = np.array([1,2,3]) |
+
+ R>
+ a = matrix(1:3, ncol=3) |
+
+ J>
+ a=[1 2 3]; |
+
+ Stacking |
+
|
+ BASIC + MATRIX OPERATIONS + |
+ |||||
|
+ Matrix-scalar |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) #
+ Note that NumPy was optimized for |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T) R>
+ A + 2 |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix-scalar |
+
|
+ Matrix-matrix |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix-matrix |
+
|
+ Matrix-vector |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, ncol=3) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix-vector |
+
|
+ Element-wise |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) #
+ Note that NumPy was optimized for |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Element-wise |
+
|
+ Matrix
+ elements to power n |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix
+ elements to power n |
+
|
+ Matrix
+ to power n |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, ncol=3) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix
+ to power n |
+
|
+ Matrix + transpose + |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9] |
+
+ Matrix + transpose + |
+
|
+ Determinant
+ of a matrix: |
+
+ M>
+ A = [6 1 1; 4 -2 5; 2 8 7] |
+
+ P> A
+ = np.array([[6,1,1],[4,-2,5],[2,8,7]]) |
+
+ R>
+ A = matrix(c(6,1,1,4,-2,5,2,8,7), nrow=3, byrow=T) |
+
+ J>
+ A=[6 1 1; 4 -2 5; 2 8 7] |
+
+ Determinant
+ of a matrix: |
+
|
+ Inverse + of a matrix + |
+
+ M>
+ A = [4 7; 2 6] |
+
+ P>
+ A = np.array([[4, 7], [2, 6]]) |
+
+ R>
+ A = matrix(c(4,7,2,6), nrow=2, byrow=T) |
+
+ J>
+ A=[4 7; 2 6] |
+
+ Inverse + of a matrix + |
+
|
+ ADVANCED + MATRIX OPERATIONS + |
+ |||||
|
+ Calculating
+ the covariance matrix |
+
+ M>
+ x1 = [4.0000 4.2000 3.9000 4.3000 4.1000]’ |
+
+ P>
+ x1 = np.array([ 4, 4.2, 3.9, 4.3, 4.1]) |
+
+ R>
+ x1 = matrix(c(4, 4.2, 3.9, 4.3, 4.1), ncol=5) |
+
+ J>
+ x1=[4.0 4.2 3.9 4.3 4.1]'; |
+
+ Calculating
+ the covariance matrix |
+
|
+ Calculating |
+
+ M>
+ A = [3 1; 1 3] |
+
+ P>
+ A = np.array([[3, 1], [1, 3]]) |
+
+ R>
+ A = matrix(c(3,1,1,3), ncol=2) |
+
+ J>
+ A=[3 1; 1 3] |
+
+ Calculating |
+
|
+ Generating
+ a Gaussian dataset: |
+
+ %
+ requires statistics toolbox package |
+
+ P>
+ mean = np.array([0,0]) |
+
+ #
+ requires the ‘mass’ package |
+
+ #
+ requires the Distributions package from
+ https://github.com/JuliaStats/Distributions.jl |
+
+ Generating
+ a Gaussian dataset: |
+
* " operator would perform a matrix-matrix multiplication of NumPy matrices - same operator performs element-wise multiplication on NumPy arrays.
+Vice versa, the "`.dot()`" method is used for matrix multiplication of NumPy arrays, whereas the equivalent operation for NumPy matrices would be achieved via the " * "-operator.
+**Most people recommend the usage of the NumPy array type over NumPy matrices, since arrays are what most of the NumPy functions return.**
\ No newline at end of file
diff --git a/tutorials/matrix_cheatsheet_only.html b/tutorials/matrix_cheatsheet_only.html
new file mode 100644
index 0000000..8d9762c
--- /dev/null
+++ b/tutorials/matrix_cheatsheet_only.html
@@ -0,0 +1,1206 @@
+
+
+
+
+ |
+ Task + |
+
+ MATLAB/Octave + |
+
+ Python + NumPy + |
+
+ R + |
+
+ Julia + |
+
+ Task + |
+
|
+ CREATING + MATRICES + |
+ |||||
|
+ Creating
+ Matrices |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9] |
+
+ Creating
+ Matrices |
+
|
+ Creating + an 1D column vector + |
+
+ M>
+ a = [1; 2; 3] |
+
+ P> + a + = + np.array([1,2,3]).reshape(1,3) +
|
+
+ R>
+ a = matrix(c(1,2,3), nrow=3, byrow=T) |
+
+ J>
+ a=[1; 2; 3] |
+
+ Creating + an 1D column vector + |
+
|
+ Creating
+ an |
+
+ M>
+ b = [1 2 3] |
+
+ P>
+ b = np.array([1,2,3]) #
+ note that numpy doesn't have P> + b.shape +(3,) +
|
+
+ R>
+ b = matrix(c(1,2,3), ncol=3) |
+
+ J>
+ b=[1 2 3] |
+
+ Creating
+ an |
+
|
+ Creating
+ a |
+
+ M>
+ rand(3,2) |
+
+ P>
+ np.random.rand(3,2) |
+
+ R>
+ matrix(runif(3*2), ncol=2) |
+
+ J>
+ rand(3,2) |
+
+ Creating
+ a |
+
|
+ Creating
+ a |
+
+ M>
+ zeros(3,2) |
+
+ P>
+ np.zeros((3,2)) |
+
+ R>
+ mat.or.vec(3, 2) |
+
+ J>
+ zeros(3,2) |
+
+ Creating
+ a |
+
|
+ Creating
+ an |
+
+ M>
+ ones(3,2) |
+
+ P>
+ np.ones((3,2)) |
+
+ R>
+ mat.or.vec(3, 2) + 1 |
+
+ J>
+ ones(3,2) |
+
+ Creating
+ an |
+
|
+ Creating
+ an |
+
+ M>
+ eye(3) |
+
+ P>
+ np.eye(3) |
+
+ R>
+ diag(3) |
+
+ J>
+ eye(3) |
+
+ Creating
+ an |
+
|
+ Creating
+ a |
+
+ M>
+ a = [1 2 3] |
+
+ P>
+ a = np.array([1,2,3]) |
+
+ R>
+ diag(1:3) |
+
+ J>
+ a=[1, 2, 3] |
+
+ Creating
+ a |
+
|
+ ACCESSING + MATRIX ELEMENTS + |
+ |||||
|
+ Getting
+ the dimension |
+
+ M>
+ A = [1 2 3; 4 5 6] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6] ]) |
+
+ R>
+ A = matrix(1:6,nrow=2,byrow=T) R>
+ dim(A) |
+
+ J>
+ A=[1 2 3; 4 5 6] |
+
+ Getting
+ the dimension |
+
|
+ Selecting + rows + |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Selecting + rows + |
+
|
+ Selecting + columns + |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Selecting + columns + |
+
|
+ Extracting
+ rows and columns by criteria |
+
+ M>
+ A = [1 2 3; 4 5 9; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,9], [7,8,9]]) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 9; 7 8 9] |
+
+ Extracting
+ rows and columns by criteria |
+
|
+ Accessing
+ elements |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(c(1,2,3,4,5,9,7,8,9),nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Accessing
+ elements |
+
|
+ MANIPULATING + SHAPE AND DIMENSIONS + |
+ |||||
|
+ Converting |
+
+ M>
+ b = [1 2 3]
|
+
+ P>
+ b = np.array([1, 2, 3]) |
+
+ R>
+ b = matrix(c(1,2,3), ncol=3) |
+
+ J>
+ b=vec([1 2 3]) |
+
+ Converting |
+
|
+ Reshaping
+ Matrices |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([[1,2,3],[4,5,6],[7,8,9]]) P>
+ B = A.reshape(1, total_elements) |
+
+ R>
+ A = matrix(1:9,nrow=3,byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9] |
+
+ Reshaping
+ Matrices |
+
|
+ Concatenating + matrices + |
+
+ M>
+ A = [1 2 3; 4 5 6] |
+
+ P>
+ A = np.array([[1, 2, 3], [4, 5, 6]]) |
+
+ R>
+ A = matrix(1:6,nrow=2,byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6]; |
+
+ Concatenating + matrices + |
+
|
+ Stacking |
+
+ M>
+ a = [1 2 3] |
+
+ P>
+ a = np.array([1,2,3]) |
+
+ R>
+ a = matrix(1:3, ncol=3) |
+
+ J>
+ a=[1 2 3]; |
+
+ Stacking |
+
|
+ BASIC + MATRIX OPERATIONS + |
+ |||||
|
+ Matrix-scalar |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) #
+ Note that NumPy was optimized for |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T) R>
+ A + 2 |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix-scalar |
+
|
+ Matrix-matrix |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix-matrix |
+
|
+ Matrix-vector |
+
+ M>
+ A = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, ncol=3) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix-vector |
+
|
+ Element-wise |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) #
+ Note that NumPy was optimized for |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Element-wise |
+
|
+ Matrix
+ elements to power n |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix
+ elements to power n |
+
|
+ Matrix
+ to power n |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, ncol=3) |
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9]; |
+
+ Matrix
+ to power n |
+
|
+ Matrix + transpose + |
+
+ M> A
+ = [1 2 3; 4 5 6; 7 8 9] |
+
+ P>
+ A = np.array([ [1,2,3], [4,5,6], [7,8,9] ]) |
+
+ R>
+ A = matrix(1:9, nrow=3, byrow=T)
|
+
+ J>
+ A=[1 2 3; 4 5 6; 7 8 9] |
+
+ Matrix + transpose + |
+
|
+ Determinant
+ of a matrix: |
+
+ M>
+ A = [6 1 1; 4 -2 5; 2 8 7] |
+
+ P> A
+ = np.array([[6,1,1],[4,-2,5],[2,8,7]]) |
+
+ R>
+ A = matrix(c(6,1,1,4,-2,5,2,8,7), nrow=3, byrow=T) |
+
+ J>
+ A=[6 1 1; 4 -2 5; 2 8 7] |
+
+ Determinant
+ of a matrix: |
+
|
+ Inverse + of a matrix + |
+
+ M>
+ A = [4 7; 2 6] |
+
+ P>
+ A = np.array([[4, 7], [2, 6]]) |
+
+ R>
+ A = matrix(c(4,7,2,6), nrow=2, byrow=T) |
+
+ J>
+ A=[4 7; 2 6] |
+
+ Inverse + of a matrix + |
+
|
+ ADVANCED + MATRIX OPERATIONS + |
+ |||||
|
+ Calculating
+ the covariance matrix |
+
+ M>
+ x1 = [4.0000 4.2000 3.9000 4.3000 4.1000]’ |
+
+ P>
+ x1 = np.array([ 4, 4.2, 3.9, 4.3, 4.1]) |
+
+ R>
+ x1 = matrix(c(4, 4.2, 3.9, 4.3, 4.1), ncol=5) |
+
+ J>
+ x1=[4.0 4.2 3.9 4.3 4.1]'; |
+
+ Calculating
+ the covariance matrix |
+
|
+ Calculating |
+
+ M>
+ A = [3 1; 1 3] |
+
+ P>
+ A = np.array([[3, 1], [1, 3]]) |
+
+ R>
+ A = matrix(c(3,1,1,3), ncol=2) |
+
+ J>
+ A=[3 1; 1 3] |
+
+ Calculating |
+
|
+ Generating
+ a Gaussian dataset: |
+
+ %
+ requires statistics toolbox package |
+
+ P>
+ mean = np.array([0,0]) |
+
+ #
+ requires the ‘mass’ package |
+
+ #
+ requires the Distributions package from
+ https://github.com/JuliaStats/Distributions.jl |
+
+ Generating
+ a Gaussian dataset: |
+
+
$[bash]> conda create -n myenv python=3\n", + "$[bash]> source activate myenv\n", + "$[bash]> conda install -n myenv numpy scipy matplotlib scikit-learn\n", + "\n", + "When we start \"python\" in your current shell session now, it will use the Python distribution in the virtual environment \"myenv\" that we have just created. To un-attach the virtual environment, you can just use\n", + "
$[bash]> source deactivate myenv\n", + "\n", + "**Note:** environments will be created in ROOT_DIR/envs by default, you can use the `-p` instead of the `-n` flag in the conda commands above in order to specify a custom path.\n", + "\n", + "**I find this procedure very convenient, especially if you are working with different distributions and versions of Python with different modules and packages installed and it is extremely useful for testing your own modules.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065\n", + "1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050\n", + "[...]\n", + "2,12.37,.94,1.36,10.6,88,1.98,.57,.28,.42,1.95,1.05,1.82,520\n", + "2,12.33,1.1,2.28,16,101,2.05,1.09,.63,.41,3.27,1.25,1.67,680\n", + "[...]\n", + "3,12.86,1.35,2.32,18,122,1.51,1.25,.21,.94,4.1,.76,1.29,630\n", + "3,12.88,2.99,2.4,20,104,1.3,1.22,.24,.83,5.4,.74,1.42,530" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
| predicted class | \n", + "\t\t||||
| class 1 | \n", + "\t\tclass 2 | \n", + "\t\tclass 3 | \n", + "\t||
| actual class | \n", + "\t\tclass 1 | \n", + "\t\tTrue positives | \n", + "\t\t||
| class 2 | \n", + "\t\tTrue positives | \n", + "\t\t|||
| class 3 | \n", + "\t\tTrue positives | \n", + "\t|||
a_namespace = {'name_a':object_1, 'name_b':object_2, ...} \n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now, the tricky part is that we have multiple independent namespaces in Python, and names can be reused for different namespaces (only the objects are unique, for example:\n",
- "\n",
- "a_namespace = {'name_a':object_1, 'name_b':object_2, ...}\n",
- "b_namespace = {'name_a':object_3, 'name_b':object_4, ...}\n",
- "\n",
- "For example, everytime we call a `for-loop` or define a function, it will create its own namespace. Namespaces also have different levels of hierarchy (the so-called \"scope\"), which we will discuss in more detail in the next section."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Scope"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the section above, we have learned that namespaces can exist independently from each other and that they are structured in a certain hierarchy, which brings us to the concept of \"scope\". The \"scope\" in Python defines the \"hierarchy level\" in which we search namespaces for certain \"name-to-object\" mappings. \n",
- "For example, let us consider the following code:"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "i = 1\n",
- "\n",
- "def foo():\n",
- " i = 5\n",
- " print(i, 'in foo()')\n",
- "\n",
- "print(i, 'global')\n",
- "\n",
- "foo()"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "1 global\n",
- "5 in foo()\n"
- ]
- }
- ],
- "prompt_number": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Here, we just defined the variable name `i` twice, once on the `foo` function."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "- `foo_namespace = {'i':object_3, ...}` \n",
- "- `global_namespace = {'i':object_1, 'name_b':object_2, ...}`"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "So, how does Python now which namespace it has to search if we want to print the value of the variable `i`? This is where Python's LEGB-rule comes into play, which we will discuss in the next section."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Tip:\n",
- "If we want to print out the dictionary mapping of the global and local variables, we can use the\n",
- "the functions `global()` and `local()"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#print(globals()) # prints global namespace\n",
- "#print(locals()) # prints local namespace\n",
- "\n",
- "glob = 1\n",
- "\n",
- "def foo():\n",
- " loc = 5\n",
- " print('loc in foo():', 'loc' in locals())\n",
- "\n",
- "foo()\n",
- "print('loc in global:', 'loc' in globals()) \n",
- "print('glob in global:', 'foo' in globals())"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "loc in foo(): True\n",
- "loc in global: False\n",
- "glob in global: True\n"
- ]
- }
- ],
- "prompt_number": 11
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Scope resolution for variable names via the LEGB rule."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We have seen that multiple namespaces can exist independently from each other and that they can contain the same variable names on different hierachy levels. The \"scope\" defines on which hierarchy level Python searches for a particular \"variable name\" for its associated object. Now, the next question is: \"In which order does Python search the different levels of namespaces before it finds the name-to-object' mapping?\" \n",
- "To answer is: It uses the LEGB-rule, which stands for\n",
- "\n",
- "**Local -> Enclosed -> Global -> Built-in**, \n",
- "\n",
- "where the arrows should denote the direction of the namespace-hierarchy search order. \n",
- "\n",
- "- *Local* can be inside a function or class method, for example. \n",
- "- *Enclosed* can be its `enclosing` function, e.g., if a function is wrapped inside another function. \n",
- "- *Global* refers to the uppermost level of the executing script itself, and \n",
- "- *Built-in* are special names that Python reserves for itself. \n",
- "\n",
- "So, if a particular name:object mapping cannot be found in the local namespaces, the namespaces of the enclosed scope are being searched next. If the search in the enclosed scope is unsuccessful, too, Python moves on to the global namespace, and eventually, it will search the global namespaces (side note: if a name cannot found in any of the namespaces, a *NameError* will is raised).\n",
- "\n",
- "**Note**: \n",
- "Namespaces can also be further nested, for example if we import modules, or if we are defining new classes. In those cases we have to use prefixes to access those nested namespaces. Let me illustrate this concept in the following code block:"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "import numpy\n",
- "import math\n",
- "import scipy\n",
- "\n",
- "print(math.pi, 'from the math module')\n",
- "print(numpy.pi, 'from the numpy package')\n",
- "print(scipy.pi, 'from the scipy package')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "3.141592653589793 from the math module\n",
- "3.141592653589793 from the numpy package\n",
- "3.141592653589793 from the scipy package\n"
- ]
- }
- ],
- "prompt_number": 8
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "(This is also why we have to be careful if we import modules via \"`from a_module import *`\", since it loads the variable names into the global namespace and could potentially overwrite already existing variable names)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "raises an error\n", - "\n", - "**b)** \n", - "
\n", - "global value [ a_var outside a_func() ]\n", - "\n", - "**c)** \n", - "
global value [ a_var in a_func() ] \n", - "global value [ a_var outside a_func() ]\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[[go to solution](#solutions)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Here is why:\n", - "\n", - "We call `a_func()` first, which is supposed to print the value of `a_var`. According to the LEGB rule, the function will first look in its own local scope (L) if `a_var` is defined there. Since `a_func()` does not define its own `a_var`, it will look one-level above in the global scope (G) in which `a_var` has been defined previously.\n", - "
raises an error\n", - "\n", - "**b)** \n", - "
local value [ a_var in a_func() ]\n", - "global value [ a_var outside a_func() ]\n", - "\n", - "**c)** \n", - "
global value [ a_var in a_func() ] \n", - "global value [ a_var outside a_func() ]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[[go to solution](#solutions)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Here is why:\n", - "\n", - "When we call `a_func()`, it will first look in its local scope (L) for `a_var`, since `a_var` is defined in the local scope of `a_func`, its assigned value `local variable` is printed. Note that this doesn't affect the global variable, which is in a different scope." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
global value\n", - "\n", - "**b)** \n", - "
enclosed value\n", - "\n", - "**c)** \n", - "
local value" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[[go to solution](#solutions)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Here is why:\n", - "\n", - "Let us quickly recapitulate what we just did: We called `outer()`, which defined the variable `a_var` locally (next to an existing `a_var` in the global scope). Next, the `outer()` function called `inner()`, which in turn defined a variable with of name `a_var` as well. The `print()` function inside `inner()` searched in the local scope first (L->E) before it went up in the scope hierarchy, and therefore it printed the value that was assigned in the local scope." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similar to the concept of the `global` keyword, which we have seen in the section above, we can use the keyword `nonlocal` inside the inner function to explicitely access a variable from the outer (enclosed) scope in order to modify its value. \n", - "Note that the `nonlocal` keyword was added in Python 3.x and is not implemented in Python 2.x (yet)." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a_var = 'global value'\n", - "\n", - "def outer():\n", - " a_var = 'local value'\n", - " print('outer before:', a_var)\n", - " def inner():\n", - " nonlocal a_var\n", - " a_var = 'inner value'\n", - " print('in inner():', a_var)\n", - " inner()\n", - " print(\"outer after:\", a_var)\n", - "outer()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "outer before: local value\n", - "in inner(): inner value\n", - "outer after: inner value\n" - ] - } - ], - "prompt_number": 5 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "
raises an error (conflict with in-built `len()` function)\n", - "\n", - "**b)** \n", - "
called my len() function\n", - "Input variable is of length 13\n", - "\n", - "**c)** \n", - "
Input variable is of length 13" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[[go to solution](#solutions)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Here is why:\n", - "\n", - "Since the exact same names can be used to map names to different objects - as long as the names are in different name spaces - there is no problem of reusing the name `len` to define our own length function (this is just for demonstration pruposes, it is NOT recommended). As we go up in Python's L -> E -> G -> B hierarchy, the function `a_func()` finds `len()` already in the global scope first before it attempts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "
a_namespace = {'name_a':object_1, 'name_b':object_2, ...} \n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Now, the tricky part is that we have multiple independent namespaces in Python, and names can be reused for different namespaces (only the objects are unique), for example:\n",
+ "\n",
+ "a_namespace = {'name_a':object_1, 'name_b':object_2, ...}\n",
+ "b_namespace = {'name_a':object_3, 'name_b':object_4, ...}\n",
+ "\n",
+    "For example, every time we call a `for-loop` or define a function, it will create its own namespace. Namespaces also have different levels of hierarchy (the so-called \"scope\"), which we will discuss in more detail in the next section."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Scope"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the section above, we have learned that namespaces can exist independently from each other and that they are structured in a certain hierarchy, which brings us to the concept of \"scope\". The \"scope\" in Python defines the \"hierarchy level\" in which we search namespaces for certain \"name-to-object\" mappings. \n",
+ "For example, let us consider the following code:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 global\n",
+ "5 in foo()\n"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In practice, **it is usually a bad idea to modify global variables inside the function scope**, since it often be the cause of confusion and weird errors that are hard to debug. \n",
- "If you want to modify a global variable via a function, it is recommended to pass it as an argument and reassign the return-value. \n",
- "For example:"
+ }
+ ],
+ "source": [
+ "i = 1\n",
+ "\n",
+ "def foo():\n",
+ " i = 5\n",
+ " print(i, 'in foo()')\n",
+ "\n",
+ "print(i, 'global')\n",
+ "\n",
+ "foo()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Here, we just defined the variable name `i` twice: once inside the `foo` function, and once in the global scope."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- `foo_namespace = {'i':object_3, ...}` \n",
+ "- `global_namespace = {'i':object_1, 'name_b':object_2, ...}`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "So, how does Python know which namespace it has to search if we want to print the value of the variable `i`? This is where Python's LEGB-rule comes into play, which we will discuss in the next section."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Tip:\n",
+ "If we want to print out the dictionary mapping of the global and local variables, we can use the\n",
+    "functions `globals()` and `locals()`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loc in foo(): True\n",
+ "loc in global: False\n",
+ "glob in global: True\n"
]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "a_var = 2\n",
- "\n",
- "def a_func(some_var):\n",
- " return 2**3\n",
- "\n",
- "a_var = a_func(a_var)\n",
- "print(a_var)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "8\n"
- ]
- }
- ],
- "prompt_number": 42
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "raises an error\n", + "\n", + "**b)** \n", + "
\n", + "global value [ a_var outside a_func() ]\n", + "\n", + "**c)** \n", + "
global value [ a_var inside a_func() ] \n", + "global value [ a_var outside a_func() ]\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[[go to solution](#solutions)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Here is why:\n", + "\n", + "We call `a_func()` first, which is supposed to print the value of `a_var`. According to the LEGB rule, the function will first look in its own local scope (L) if `a_var` is defined there. Since `a_func()` does not define its own `a_var`, it will look one-level above in the global scope (G) in which `a_var` has been defined previously.\n", + "
raises an error\n", + "\n", + "**b)** \n", + "
local value [ a_var inside a_func() ]\n", + "global value [ a_var outside a_func() ]\n", + "\n", + "**c)** \n", + "
global value [ a_var inside a_func() ] \n", + "global value [ a_var outside a_func() ]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[[go to solution](#solutions)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Here is why:\n", + "\n", + "When we call `a_func()`, it will first look in its local scope (L) for `a_var`, since `a_var` is defined in the local scope of `a_func`, its assigned value `local variable` is printed. Note that this doesn't affect the global variable, which is in a different scope." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
global value\n", + "\n", + "**b)** \n", + "
enclosed value\n", + "\n", + "**c)** \n", + "
local value" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[[go to solution](#solutions)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Here is why:\n", + "\n", + "Let us quickly recapitulate what we just did: We called `outer()`, which defined the variable `a_var` locally (next to an existing `a_var` in the global scope). Next, the `outer()` function called `inner()`, which in turn defined a variable with of name `a_var` as well. The `print()` function inside `inner()` searched in the local scope first (L->E) before it went up in the scope hierarchy, and therefore it printed the value that was assigned in the local scope." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to the concept of the `global` keyword, which we have seen in the section above, we can use the keyword `nonlocal` inside the inner function to explicitly access a variable from the outer (enclosed) scope in order to modify its value. \n", + "Note that the `nonlocal` keyword was added in Python 3.x and is not implemented in Python 2.x (yet)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "outer before: local value\n", + "in inner(): inner value\n", + "outer after: inner value\n" ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for a in range(5):\n", - " if a == 4:\n", - " print(a, '-> a in for-loop')\n", - "print(a, '-> a in global')" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "4 -> a in for-loop\n", - "4 -> a in global\n" - ] - } - ], - "prompt_number": 5 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**This also applies if we explicitely defined the `for-loop` variable in the global namespace before!** In this case it will rebind the existing variable:" + } + ], + "source": [ + "a_var = 'global value'\n", + "\n", + "def outer():\n", + " a_var = 'local value'\n", + " print('outer before:', a_var)\n", + " def inner():\n", + " nonlocal a_var\n", + " a_var = 'inner value'\n", + " print('in inner():', a_var)\n", + " inner()\n", + " print(\"outer after:\", a_var)\n", + "outer()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
raises an error (conflict with in-built `len()` function)\n", + "\n", + "**b)** \n", + "
called my len() function\n", + "Input variable is of length 13\n", + "\n", + "**c)** \n", + "
Input variable is of length 13" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[[go to solution](#solutions)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Here is why:\n", + "\n", + "Since the exact same names can be used to map names to different objects - as long as the names are in different name spaces - there is no problem of reusing the name `len` to define our own length function (this is just for demonstration pruposes, it is NOT recommended). As we go up in Python's L -> E -> G -> B hierarchy, the function `a_func()` finds `len()` already in the global scope (G) first before it attempts to search the built-in (B) namespace." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
\n",
- "print(4, '-> i in global')\n",
- ""
+ }
+ ],
+ "source": [
+ "for a in range(5):\n",
+ " if a == 4:\n",
+ " print(a, '-> a in for-loop')\n",
+ "print(a, '-> a in global')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**This also applies if we explicitly defined the `for-loop` variable in the global namespace before!** In this case it will rebind the existing variable:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4 -> b in for-loop\n",
+ "4 -> b in global\n"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This goes back to a change that was made in Python 3.x and is described in [What\u2019s New In Python 3.0](https://docs.python.org/3/whatsnew/3.0.html) as follows:\n",
- "\n",
- "\"List comprehensions no longer support the syntactic form `[... for var in item1, item2, ...]`. Use `[... for var in (item1, item2, ...)]` instead. Also note that list comprehensions have different semantics: they are closer to syntactic sugar for a generator expression inside a `list()` constructor, and in particular the loop control variables are no longer leaked into the surrounding scope.\""
+ }
+ ],
+ "source": [
+ "b = 1\n",
+ "for b in range(5):\n",
+ " if b == 4:\n",
+ " print(b, '-> b in for-loop')\n",
+ "print(b, '-> b in global')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "However, in **Python 3.x**, we can use closures to prevent the for-loop variable from leaking into the global namespace. Here is an example (executed in Python 3.4):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0, 1, 2, 3, 4]\n",
+ "1 -> i in global\n"
]
}
],
- "metadata": {}
+ "source": [
+ "i = 1\n",
+ "print([i for i in range(5)])\n",
+ "print(i, '-> i in global')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Why did I mention \"Python 3.x\"? Well, as it happens, the same code executed in Python 2.x would print:\n",
+ "\n",
+ "\n",
+ "4 -> i in global\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This goes back to a change that was made in Python 3.x and is described in [What’s New In Python 3.0](https://docs.python.org/3/whatsnew/3.0.html) as follows:\n",
+ "\n",
+ "\"List comprehensions no longer support the syntactic form `[... for var in item1, item2, ...]`. Use `[... for var in (item1, item2, ...)]` instead. Also note that list comprehensions have different semantics: they are closer to syntactic sugar for a generator expression inside a `list()` constructor, and in particular the loop control variables are no longer leaked into the surrounding scope.\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": []
}
- ]
-}
\ No newline at end of file
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tutorials/sorting_csvs.ipynb b/tutorials/sorting_csvs.ipynb
new file mode 100644
index 0000000..df1b182
--- /dev/null
+++ b/tutorials/sorting_csvs.ipynb
@@ -0,0 +1,757 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:f56b7081a6e5b63610100fcfa0a226c7a0184dfe0d63128614a7a68555653428"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[Sebastian Raschka](http://sebastianraschka.com) \n",
+ "last updated: 05/13/2014\n",
+ "\n",
+ "- Open in [IPython nbviewer](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/sorting_csvs.ipynb?create=1) \n",
+ "- Link to this [IPython notebook on Github](https://github.com/rasbt/python_reference/blob/master/tutorials/sorting_csvs.ipynb) \n",
+ "- Link to the GitHub Repository [`python_reference`](https://github.com/rasbt/python_reference)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "I am looking forward to comments or suggestions, please don't hesitate to contact me via\n",
+ "[twitter](https://twitter.com/rasbt), [email](mailto:bluewoodtree@gmail.com), or [google+](https://plus.google.com/118404394130788869227).\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sorting CSV files using the Python `csv` module"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "I wanted to summarize a way to sort CSV files by just using the [`csv` module](https://docs.python.org/3.4/library/csv.html) and other standard library Python modules \n",
+ "(you probably also want to consider using the [pandas](http://pandas.pydata.org) library if you are working with very large CSV files - I am planning to make this a separate topic)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
\n",
+ "
\n",
+ "## Sections\n",
+ "- [Reading in a CSV file](#reading)\n",
+ "- [Printing the CSV file contents](#printing)\n",
+ "- [Converting numeric cells to floats](#floats)\n",
+ "- [Sorting the CSV file](#sorting)\n",
+ "- [Marking min/max values in particular columns](#marking)\n",
+ "- [Writing out the modified table as a new CSV file](#writing)\n",
+ "- [Batch processing CSV files](#batch)\n",
+ "
\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Objective:\n",
+ "\n",
+ "Let us assume that we have an [example CSV](../Data/test.csv) file formatted like this:\n",
+ " \n",
+ "name,column1,column2,column3\n",
+ "abc,1.1,4.2,1.2\n",
+ "def,2.1,1.4,5.2\n",
+ "ghi,1.5,1.2,2.1\n",
+ "jkl,1.8,1.1,4.2\n",
+ "mno,9.4,6.6,6.2\n",
+ "pqr,1.4,8.3,8.4
\n",
+ "\n",
+ "And we want to sort particular columns and eventually mark min- or max-values in the table.\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Reading in a CSV file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because we will be iterating over our CSV file a couple of times, let us read in the CSV file using the `csv` module and hold the contents in memory using a Python list object (note: be careful with very large CSV files and possible memory issues associated with this approach).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import csv\n",
+ "\n",
+ "def csv_to_list(csv_file, delimiter=','):\n",
+ " \"\"\" \n",
+ " Reads in a CSV file and returns the contents as list,\n",
+ " where every row is stored as a sublist, and each element\n",
+ " in the sublist represents 1 cell in the table.\n",
+ " \n",
+ " \"\"\"\n",
+ " with open(csv_file, 'r') as csv_con:\n",
+ " reader = csv.reader(csv_con, delimiter=delimiter)\n",
+ " return list(reader)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 1
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "csv_cont = csv_to_list('../Data/test.csv')\n",
+ "\n",
+ "print('first 3 rows:')\n",
+ "for row in range(3):\n",
+ " print(csv_cont[row])"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "first 3 rows:\n",
+ "['name', 'column1', 'column2', 'column3']\n",
+ "['abc', '1.1', '4.2', '1.2']\n",
+ "['def', '2.1', '1.4', '5.2']\n"
+ ]
+ }
+ ],
+ "prompt_number": 2
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Printing the CSV file contents"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Also, let us define a short function that prints out the CSV file to the standard output screen in a slightly prettier format:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def print_csv(csv_content):\n",
+ " \"\"\" Prints CSV file to standard output.\"\"\"\n",
+ " print(50*'-')\n",
+ " for row in csv_content:\n",
+ " row = [str(e) for e in row]\n",
+ " print('\\t'.join(row))\n",
+ " print(50*'-')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 3
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "csv_cont = csv_to_list('../Data/test.csv')\n",
+ "\n",
+ "print('\\n\\nOriginal CSV file:')\n",
+ "print_csv(csv_cont)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "\n",
+ "Original CSV file:\n",
+ "--------------------------------------------------\n",
+ "name\tcolumn1\tcolumn2\tcolumn3\n",
+ "abc\t1.1\t4.2\t1.2\n",
+ "def\t2.1\t1.4\t5.2\n",
+ "ghi\t1.5\t1.2\t-2.1\n",
+ "jkl\t1.8\t-1.1\t4.2\n",
+ "mno\t9.4\t6.6\t6.2\n",
+ "pqr\t1.4\t8.3\t8.4\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "prompt_number": 4
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Converting numeric cells to floats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To avoid problems with the sorting approach that can occur when we have negative values in some cells, let us define a function that converts all numeric cells into float values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def convert_cells_to_floats(csv_cont):\n",
+ " \"\"\" \n",
+ " Converts cells to floats if possible\n",
+ " (modifies input CSV content list).\n",
+ " \n",
+ " \"\"\"\n",
+ " for row in range(len(csv_cont)):\n",
+ " for cell in range(len(csv_cont[row])):\n",
+ " try:\n",
+ " csv_cont[row][cell] = float(csv_cont[row][cell])\n",
+ " except ValueError:\n",
+ " pass "
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 5
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "print('first 3 rows:')\n",
+ "for row in range(3):\n",
+ " print(csv_cont[row])"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "first 3 rows:\n",
+ "['name', 'column1', 'column2', 'column3']\n",
+ "['abc', '1.1', '4.2', '1.2']\n",
+ "['def', '2.1', '1.4', '5.2']\n"
+ ]
+ }
+ ],
+ "prompt_number": 6
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Sorting the CSV file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Using the very handy [`operator.itemgetter`](https://docs.python.org/3.4/library/operator.html#operator.itemgetter) function, we define a function that returns a CSV file contents sorted by a particular column (column index or column name)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import operator\n",
+ "\n",
+ "def sort_by_column(csv_cont, col, reverse=False):\n",
+ " \"\"\" \n",
+ " Sorts CSV contents by column name (if col argument is type str) \n",
+ " or column index (if col argument is type int). \n",
+ " \n",
+ " \"\"\"\n",
+ " header = csv_cont[0]\n",
+ " body = csv_cont[1:]\n",
+ " if isinstance(col, str): \n",
+ " col_index = header.index(col)\n",
+ " else:\n",
+ " col_index = col\n",
+ " body = sorted(body, \n",
+ " key=operator.itemgetter(col_index), \n",
+ " reverse=reverse)\n",
+ " body.insert(0, header)\n",
+ " return body"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 7
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To see how (and if) it works, let us sort the CSV file in [../Data/test.csv](../Data/test.csv) by the column name \"column3\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "csv_cont = csv_to_list('../Data/test.csv')\n",
+ "\n",
+ "print('\\n\\nOriginal CSV file:')\n",
+ "print_csv(csv_cont)\n",
+ "\n",
+ "print('\\n\\nCSV sorted by column \"column3\":')\n",
+ "convert_cells_to_floats(csv_cont)\n",
+ "csv_sorted = sort_by_column(csv_cont, 'column3')\n",
+ "print_csv(csv_sorted)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "\n",
+ "Original CSV file:\n",
+ "--------------------------------------------------\n",
+ "name\tcolumn1\tcolumn2\tcolumn3\n",
+ "abc\t1.1\t4.2\t1.2\n",
+ "def\t2.1\t1.4\t5.2\n",
+ "ghi\t1.5\t1.2\t-2.1\n",
+ "jkl\t1.8\t-1.1\t4.2\n",
+ "mno\t9.4\t6.6\t6.2\n",
+ "pqr\t1.4\t8.3\t8.4\n",
+ "--------------------------------------------------\n",
+ "\n",
+ "\n",
+ "CSV sorted by column \"column3\":\n",
+ "--------------------------------------------------\n",
+ "name\tcolumn1\tcolumn2\tcolumn3\n",
+ "ghi\t1.5\t1.2\t-2.1\n",
+ "abc\t1.1\t4.2\t1.2\n",
+ "jkl\t1.8\t-1.1\t4.2\n",
+ "def\t2.1\t1.4\t5.2\n",
+ "mno\t9.4\t6.6\t6.2\n",
+ "pqr\t1.4\t8.3\t8.4\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "prompt_number": 8
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Marking min/max values in particular columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To visualize minimum and maximum values in certain columns I find it quite useful to add little symbols to the cells (most people like to highlight cells with colors in e.g., Excel spreadsheets, but CSV doesn't support colors, so this is my workaround - please let me know if you figured out a better approach, I look forward to your suggestions)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def mark_minmax(csv_cont, col, mark_max=True, marker='*'):\n",
+ " \"\"\"\n",
+ " Sorts a list of CSV contents by a particular column \n",
+ " (see sort_by_column function).\n",
+ " Puts a marker on the maximum value if mark_max=True,\n",
+ " or puts a marker on the minimum value if mark_max=False\n",
+ " (modifies input CSV content list).\n",
+ " \n",
+ " \"\"\"\n",
+ " \n",
+ " sorted_csv = sort_by_column(csv_cont, col, reverse=mark_max)\n",
+ " if isinstance(col, str): \n",
+ " col_index = sorted_csv[0].index(col)\n",
+ " else:\n",
+ " col_index = col\n",
+ " sorted_csv[1][col_index] = str(sorted_csv[1][col_index]) + marker\n",
+ " return None"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 9
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def mark_all_col(csv_cont, mark_max=True, marker='*'):\n",
+ " \"\"\"\n",
+ " Marks all maximum (if mark_max=True) or minimum (if mark_max=False)\n",
+ " values in all columns of a CSV contents list - except the first column.\n",
+ " Returns a new list that is sorted by the names in the first column\n",
+ " (modifies input CSV content list).\n",
+ " \n",
+ " \"\"\"\n",
+ " for c in range(1, len(csv_cont[0])):\n",
+ " mark_minmax(csv_cont, c, mark_max, marker)\n",
+ " marked_csv = sort_by_column(csv_cont, 0, False)\n",
+ " return marked_csv"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 10
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import copy\n",
+ "\n",
+ "csv_cont = csv_to_list('../Data/test.csv')\n",
+ "\n",
+ "csv_marked = copy.deepcopy(csv_cont)\n",
+ "convert_cells_to_floats(csv_marked)\n",
+ "mark_all_col(csv_marked, mark_max=False, marker='*')\n",
+ "print_csv(csv_marked)\n",
+ "print('*: min-value')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "--------------------------------------------------\n",
+ "name\tcolumn1\tcolumn2\tcolumn3\n",
+ "abc\t1.1*\t4.2\t1.2\n",
+ "def\t2.1\t1.4\t5.2\n",
+ "ghi\t1.5\t1.2\t-2.1*\n",
+ "jkl\t1.8\t-1.1*\t4.2\n",
+ "mno\t9.4\t6.6\t6.2\n",
+ "pqr\t1.4\t8.3\t8.4\n",
+ "--------------------------------------------------\n",
+ "*: min-value\n"
+ ]
+ }
+ ],
+ "prompt_number": 12
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Writing out the modified table as a new CSV file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After the sorting and maybe marking of minimum and maximum values, we likely want to write out the modified data table as CSV file again."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def write_csv(dest, csv_cont):\n",
+ " \"\"\" Writes a comma-delimited CSV file. \"\"\"\n",
+ "\n",
+ " with open(dest, 'w') as out_file:\n",
+ " writer = csv.writer(out_file, delimiter=',')\n",
+ " for row in csv_cont:\n",
+ " writer.writerow(row)\n",
+ "\n",
+ "write_csv('../Data/test_marked.csv', csv_marked)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 13
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let us read in the written CSV file to confirm that the formatting is correct:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "csv_cont = csv_to_list('../Data/test_marked.csv')\n",
+ "\n",
+ "print('\\n\\nWritten CSV file:')\n",
+ "print_csv(csv_cont)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "\n",
+ "Written CSV file:\n",
+ "--------------------------------------------------\n",
+ "name\tcolumn1\tcolumn2\tcolumn3\n",
+ "abc\t1.1*\t4.2\t1.2\n",
+ "def\t2.1\t1.4\t5.2\n",
+ "ghi\t1.5\t1.2\t-2.1*\n",
+ "jkl\t1.8\t-1.1*\t4.2\n",
+ "mno\t9.4\t6.6\t6.2\n",
+ "pqr\t1.4\t8.3\t8.4\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "prompt_number": 14
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Batch processing CSV files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Usually, CSV files never come alone, but we have to process a whole bunch of similar formatted CSV files from some output device. \n",
+ "For example, if we want to process all CSV files in a particular input directory and want to save the processed files in a separate output directory, we can use a simple list comprehension to collect tuples of input-output file names."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import os\n",
+ "\n",
+ "in_dir = '../Data'\n",
+ "out_dir = '../Data/processed'\n",
+ "csvs = [\n",
+ " (os.path.join(in_dir, csv), \n",
+ " os.path.join(out_dir, csv))\n",
+ " for csv in os.listdir(in_dir) \n",
+ " if csv.endswith('.csv')\n",
+ " ]\n",
+ "\n",
+ "for i in csvs:\n",
+ " print(i)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "('../Data/test.csv', '../Data/processed/test.csv')\n",
+ "('../Data/test_marked.csv', '../Data/processed/test_marked.csv')\n"
+ ]
+ }
+ ],
+ "prompt_number": 12
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "Next, we can summarize the processes we want to apply to the CSV files in a simple function and loop over our file names:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def process_csv(csv_in, csv_out):\n",
+ " \"\"\" \n",
+ " Takes an input- and output-filename of a CSV file\n",
+ " and marks minimum values for every column.\n",
+ " \n",
+ " \"\"\"\n",
+ " csv_cont = csv_to_list(csv_in)\n",
+ " csv_marked = copy.deepcopy(csv_cont)\n",
+ " convert_cells_to_floats(csv_marked)\n",
+ " mark_all_col(csv_marked, mark_max=False, marker='*')\n",
+ " write_csv(csv_out, csv_marked)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 18
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "for inout in csvs:\n",
+ " process_csv(inout[0], inout[1])"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/sqlite3_howto/LICENSE b/tutorials/sqlite3_howto/LICENSE
similarity index 100%
rename from sqlite3_howto/LICENSE
rename to tutorials/sqlite3_howto/LICENSE
diff --git a/sqlite3_howto/README.md b/tutorials/sqlite3_howto/README.md
similarity index 89%
rename from sqlite3_howto/README.md
rename to tutorials/sqlite3_howto/README.md
index 9549f63..c596dfc 100644
--- a/sqlite3_howto/README.md
+++ b/tutorials/sqlite3_howto/README.md
@@ -6,7 +6,7 @@ _\-- written by Sebastian Raschka_ on March 7, 2014
-
+
@@ -29,7 +29,7 @@ _\-- written by Sebastian Raschka_ on March 7, 2014
• Conclusion
The complete Python code that I am using in this tutorial can be downloaded
-from my GitHub repository:
+from my GitHub repository: [https://github.com/rasbt/python_reference/tree/master/tutorials/sqlite3_howto](https://github.com/rasbt/python_reference/tree/master/tutorials/sqlite3_howto)
* * *
@@ -97,7 +97,7 @@ there is more information about PRIMARY KEYs further down in this section).
- mport sqlite3
+ import sqlite3
sqlite_file = 'my_first_db.sqlite' # name of the sqlite database file
table_name1 = 'my_table_1' # name of the table to be created
@@ -123,8 +123,7 @@ there is more information about PRIMARY KEYs further down in this section).
conn.close()
-Download the script: [create_new_db.py](https://raw.github.com/rasbt/python_sq
-lite_code/master/code/create_new_db.py)
+Download the script: [create_new_db.py](https://github.com/rasbt/python_reference/blob/master/tutorials/sqlite3_howto/code/create_new_db.py)
* * *
@@ -135,7 +134,7 @@ lite_code/master/code/create_new_db.py)
-
+
Using the code above, we created a new `.sqlite` database file with 2 tables.
Each table consists of currently one column only, which is of type INTEGER.
@@ -208,12 +207,11 @@ Let's have a look at some code:
conn.close()
-Download the script: [add_new_column.py](https://raw.github.com/rasbt/python_s
-qlite_code/master/code/add_new_column.py)
+Download the script: [add_new_column.py](https://github.com/rasbt/python_reference/blob/master/tutorials/sqlite3_howto/code/add_new_column.py)
-
+
We just added 2 more columns (`my_2nd_column` and `my_3rd_column`) to
@@ -272,10 +270,9 @@ But let us first have a look at the example code:
conn.close()
-Download the script: [update_or_insert_records.py](https://raw.github.com/rasb
-t/python_sqlite_code/master/code/update_or_insert_records.py)
+Download the script: [update_or_insert_records.py](code/update_or_insert_records.py)
-
+
Both A) `INSERT` and B) `INSERT OR IGNORE` have in common that they append new
rows to the database if a given PRIMARY KEY does not exist in the database
@@ -337,10 +334,9 @@ drop the index, which is also shown in the code below.
conn.close()
-Download the script: [create_unique_index.py](https://raw.github.com/rasbt/pyt
-hon_sqlite_code/master/code/create_unique_index.py)
+Download the script: [create_unique_index.py](code/create_unique_index.py)
-
+
@@ -403,19 +399,17 @@ row entries for all or some columns if they match certain criteria.
conn.close()
-Download the script: [selecting_entries.py](https://raw.github.com/rasbt/pytho
-n_sqlite_code/master/code/selecting_entries.py)
+Download the script: [selecting_entries.py](code/selecting_entries.py)
-
+
if we use the `.fetchall()` method, we return a list of tuples from the
database query, where each tuple represents one row entry. The print output
for the 5 different cases shown in the code above would look like this (note
that we only have a table with 1 row here):
-
+
@@ -545,12 +539,11 @@ that have been added xxx days ago.
conn.close()
-Download the script: [date_time_ops.py](https://raw.github.com/rasbt/python_sq
-lite_code/master/code/date_time_ops.py)
+Download the script: [date_time_ops.py](code/date_time_ops.py)
-
+
Some of the really convenient functions that return the current time and date
@@ -585,7 +578,7 @@ and entries that are older than 1 day via
Note that we don't have to provide the complete time stamps here, the same
syntax applies to simple dates or simple times only, too.
-
+
@@ -593,7 +586,7 @@ syntax applies to simple dates or simple times only, too.
#### Update Mar 16, 2014:
-If'd we are interested to calulate the hours between two `DATETIME()`
+If'd we are interested to calculate the hours between two `DATETIME()`
timestamps, we can could use the handy `STRFTIME()` function like this
@@ -648,10 +641,9 @@ column names):
conn.close()
-Download the script: [get_columnnames.py](https://raw.github.com/rasbt/python_
-sqlite_code/master/code/get_columnnames.py)
+Download the script: [get_columnnames.py](code/get_columnnames.py)
-
+
Since we haven't created a PRIMARY KEY column for `my_table_3`, SQLite
automatically provides an indexed `rowid` column with unique ascending integer
@@ -669,7 +661,7 @@ grab the 2nd value in each tuple of the returned list, which can be done by
after the `PRAGMA TABLE_INFO()` call. If we would print the contents of the
variable `names` now, the output would look like this:
-
+
@@ -685,53 +677,58 @@ convenient script to print a nice overview of SQLite database tables:
import sqlite3
-
+
+
def connect(sqlite_file):
""" Make connection to an SQLite database file """
conn = sqlite3.connect(sqlite_file)
c = conn.cursor()
return conn, c
-
+
+
def close(conn):
""" Commit changes and close connection to the database """
# conn.commit()
conn.close()
-
+
+
def total_rows(cursor, table_name, print_out=False):
""" Returns the total number of rows in the database """
- c.execute('SELECT COUNT(*) FROM {}'.format(table_name))
- count = c.fetchall()
+ cursor.execute('SELECT COUNT(*) FROM {}'.format(table_name))
+ count = cursor.fetchall()
if print_out:
print('\nTotal rows: {}'.format(count[0][0]))
return count[0][0]
-
+
+
def table_col_info(cursor, table_name, print_out=False):
- """
- Returns a list of tuples with column informations:
- (id, name, type, notnull, default_value, primary_key)
-
+ """ Returns a list of tuples with column informations:
+ (id, name, type, notnull, default_value, primary_key)
"""
- c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
- info = c.fetchall()
-
+ cursor.execute('PRAGMA TABLE_INFO({})'.format(table_name))
+ info = cursor.fetchall()
+
if print_out:
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
print(col)
return info
-
+
+
def values_in_col(cursor, table_name, print_out=True):
- """ Returns a dictionary with columns as keys and the number of not-null
- entries as associated values.
+ """ Returns a dictionary with columns as keys
+ and the number of not-null entries as associated values.
"""
- c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
- info = c.fetchall()
+ cursor.execute('PRAGMA TABLE_INFO({})'.format(table_name))
+ info = cursor.fetchall()
col_dict = dict()
for col in info:
col_dict[col[1]] = 0
for col in col_dict:
- c.execute('SELECT ({0}) FROM {1} WHERE {0} IS NOT NULL'.format(col, table_name))
- # In my case this approach resulted in a better performance than using COUNT
+ c.execute('SELECT ({0}) FROM {1} '
+ 'WHERE {0} IS NOT NULL'.format(col, table_name))
+ # In my case this approach resulted in a
+ # better performance than using COUNT
number_rows = len(c.fetchall())
col_dict[col] = number_rows
if print_out:
@@ -739,27 +736,26 @@ convenient script to print a nice overview of SQLite database tables:
for i in col_dict.items():
print('{}: {}'.format(i[0], i[1]))
return col_dict
-
-
+
+
if __name__ == '__main__':
-
+
sqlite_file = 'my_first_db.sqlite'
table_name = 'my_table_3'
-
+
conn, c = connect(sqlite_file)
total_rows(c, table_name, print_out=True)
table_col_info(c, table_name, print_out=True)
- values_in_col(c, table_name, print_out=True) # slow on large data bases
-
+ # next line might be slow on large databases
+ values_in_col(c, table_name, print_out=True)
+
close(conn)
-
-Download the script: [print_db_info.py](https://raw.github.com/rasbt/python_sq
-lite_code/master/code/print_db_info.py)
+Download the script: [print_db_info.py](code/print_db_info.py)
-
+
-
+
diff --git a/sqlite3_howto/code/add_new_column.py b/tutorials/sqlite3_howto/code/add_new_column.py
similarity index 100%
rename from sqlite3_howto/code/add_new_column.py
rename to tutorials/sqlite3_howto/code/add_new_column.py
diff --git a/sqlite3_howto/code/create_new_db.py b/tutorials/sqlite3_howto/code/create_new_db.py
similarity index 100%
rename from sqlite3_howto/code/create_new_db.py
rename to tutorials/sqlite3_howto/code/create_new_db.py
diff --git a/sqlite3_howto/code/create_unique_index.py b/tutorials/sqlite3_howto/code/create_unique_index.py
similarity index 100%
rename from sqlite3_howto/code/create_unique_index.py
rename to tutorials/sqlite3_howto/code/create_unique_index.py
diff --git a/sqlite3_howto/code/date_time_ops.py b/tutorials/sqlite3_howto/code/date_time_ops.py
similarity index 100%
rename from sqlite3_howto/code/date_time_ops.py
rename to tutorials/sqlite3_howto/code/date_time_ops.py
diff --git a/sqlite3_howto/code/get_columnnames.py b/tutorials/sqlite3_howto/code/get_columnnames.py
similarity index 100%
rename from sqlite3_howto/code/get_columnnames.py
rename to tutorials/sqlite3_howto/code/get_columnnames.py
diff --git a/sqlite3_howto/code/print_db_info.py b/tutorials/sqlite3_howto/code/print_db_info.py
similarity index 67%
rename from sqlite3_howto/code/print_db_info.py
rename to tutorials/sqlite3_howto/code/print_db_info.py
index 22b72a8..285a635 100644
--- a/sqlite3_howto/code/print_db_info.py
+++ b/tutorials/sqlite3_howto/code/print_db_info.py
@@ -22,52 +22,57 @@
import sqlite3
+
def connect(sqlite_file):
""" Make connection to an SQLite database file """
conn = sqlite3.connect(sqlite_file)
c = conn.cursor()
return conn, c
+
def close(conn):
""" Commit changes and close connection to the database """
- #conn.commit()
+ # conn.commit()
conn.close()
+
def total_rows(cursor, table_name, print_out=False):
""" Returns the total number of rows in the database """
- c.execute('SELECT COUNT(*) FROM {}'.format(table_name))
- count = c.fetchall()
+ cursor.execute('SELECT COUNT(*) FROM {}'.format(table_name))
+ count = cursor.fetchall()
if print_out:
print('\nTotal rows: {}'.format(count[0][0]))
return count[0][0]
+
def table_col_info(cursor, table_name, print_out=False):
- """
- Returns a list of tuples with column informations:
- (id, name, type, notnull, default_value, primary_key)
-
+ """ Returns a list of tuples with column informations:
+ (id, name, type, notnull, default_value, primary_key)
"""
- c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
- info = c.fetchall()
-
+ cursor.execute('PRAGMA TABLE_INFO({})'.format(table_name))
+ info = cursor.fetchall()
+
if print_out:
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
print(col)
return info
+
def values_in_col(cursor, table_name, print_out=True):
- """ Returns a dictionary with columns as keys and the number of not-null
- entries as associated values.
+ """ Returns a dictionary with columns as keys
+ and the number of not-null entries as associated values.
"""
- c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
- info = c.fetchall()
+ cursor.execute('PRAGMA TABLE_INFO({})'.format(table_name))
+ info = cursor.fetchall()
col_dict = dict()
for col in info:
col_dict[col[1]] = 0
for col in col_dict:
- c.execute('SELECT ({0}) FROM {1} WHERE {0} IS NOT NULL'.format(col, table_name))
- # In my case this approach resulted in a better performance than using COUNT
+ c.execute('SELECT ({0}) FROM {1} '
+ 'WHERE {0} IS NOT NULL'.format(col, table_name))
+ # In my case this approach resulted in a
+ # better performance than using COUNT
number_rows = len(c.fetchall())
col_dict[col] = number_rows
if print_out:
@@ -85,7 +90,7 @@ def values_in_col(cursor, table_name, print_out=True):
conn, c = connect(sqlite_file)
total_rows(c, table_name, print_out=True)
table_col_info(c, table_name, print_out=True)
- values_in_col(c, table_name, print_out=True) # slow on large data bases
-
- close(conn)
+ # next line might be slow on large databases
+ values_in_col(c, table_name, print_out=True)
+ close(conn)
diff --git a/sqlite3_howto/code/selecting_entries.py b/tutorials/sqlite3_howto/code/selecting_entries.py
similarity index 100%
rename from sqlite3_howto/code/selecting_entries.py
rename to tutorials/sqlite3_howto/code/selecting_entries.py
diff --git a/sqlite3_howto/code/update_or_insert_records.py b/tutorials/sqlite3_howto/code/update_or_insert_records.py
similarity index 94%
rename from sqlite3_howto/code/update_or_insert_records.py
rename to tutorials/sqlite3_howto/code/update_or_insert_records.py
index 37292a5..ee461ec 100644
--- a/sqlite3_howto/code/update_or_insert_records.py
+++ b/tutorials/sqlite3_howto/code/update_or_insert_records.py
@@ -1,6 +1,6 @@
# Sebastian Raschka, 2014
# Update records or insert them if they don't exist.
-# Note that this is a workaround to accomodate for missing
+# Note that this is a workaround to accommodate for missing
# SQL features in SQLite.
import sqlite3
diff --git a/sqlite3_howto/code/updating_rows.py b/tutorials/sqlite3_howto/code/updating_rows.py
similarity index 100%
rename from sqlite3_howto/code/updating_rows.py
rename to tutorials/sqlite3_howto/code/updating_rows.py
diff --git a/sqlite3_howto/code/write_from_sqlite.py b/tutorials/sqlite3_howto/code/write_from_sqlite.py
similarity index 100%
rename from sqlite3_howto/code/write_from_sqlite.py
rename to tutorials/sqlite3_howto/code/write_from_sqlite.py
diff --git a/tutorials/table_of_contents_ipython.ipynb b/tutorials/table_of_contents_ipython.ipynb
new file mode 100644
index 0000000..1245132
--- /dev/null
+++ b/tutorials/table_of_contents_ipython.ipynb
@@ -0,0 +1,281 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:34307c4f0973ebef511e97c036657231fc4e230e7627cfe073d89f4046f9ce9f"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[Sebastian Raschka](http://sebastianraschka.com) \n",
+ "last updated: 05/29/2014"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "I would be happy to hear your comments and suggestions. \n",
+ "Please feel free to drop me a note via\n",
+ "[twitter](https://twitter.com/rasbt), [email](mailto:bluewoodtree@gmail.com), or [google+](https://plus.google.com/118404394130788869227).\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Creating a table of contents with internal links in IPython Notebooks and Markdown documents"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Many people have asked me how I create the table of contents with internal links for my IPython notebooks and Markdown documents on GitHub. \n",
+ "Well, no (IPython) magic is involved, it is just a little bit of HTML, but I thought it might be worthwhile to write this little how-to tutorial."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
\n",
+ "For example, [click this link](#bottom) to jump to the bottom of the page.\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The two components to create an internal link"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "So how does it work? Basically, all you need are those two components: \n",
+ "1. the destination\n",
+ "2. an internal hyperlink to the destination"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###1. The destination"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To define the destination (i.e., the section on the page or the cell you want to jump to), you just need to insert an empty HTML anchor tag and give it an **`id`**, \n",
+ "e.g., **``** \n",
+ "\n",
+ "This anchor tag will be invisible if you render it as Markdown in the IPython notebook. \n",
+ "Note that it would also work if we use the **`name`** attribute instead of **`id`**, but since the **`name`** attribute is not supported by HTML5 anymore, I would suggest to just use the **`id`** attribute, which is also shorter to type."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###2. The internal hyperlink"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we have to create the hyperlink to the **``** anchor tag that we just created. \n",
+ "We can either do this in ye goode olde HTML where we put a fragment identifier in form of a hash mark (`#`) in front of the name, \n",
+ "for example, **`Link to the destination'`**\n",
+ "\n",
+ "Or alternatively, we can just use the slightly more convenient Markdown syntax: \n",
+ "**`[Link to the destination](#the_destination)`**\n",
+ "\n",
+ "**That's all!**\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# One more piece of advice"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Of course it would make sense to place the empty anchor tags for you table of contents just on top of each cell that contains a heading. \n",
+ "E.g., "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`` \n",
+ "`###Section 2` \n",
+ "`some text ...` "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And I did this for a very long time ... until I figured out that it wouldn't render the Markdown properly if you convert the IPython Notebook into HTML (for example, for printing via the print preview option). \n",
+ "\n",
+ "But instead of "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###Section 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "it would be rendered as"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`###Section 2`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "which is certainly not what we want (note that it looks normal in the IPython notebook, but not in the converted HTML version). So my favorite remedy would be to put the `id`-anchor tag into a separate cell just above the section, ideally with some line breaks for nicer visuals.\n",
+ "\n",
+ "\n",
+ "\n",
+ "### Solution 1: id-anchor tag in a separate cell\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "### Solution 2: line break between the id-anchor and text:\n",
+ "\n",
+ "\n",
+ "\n",
+ "(this alternative workaround was kindly submitted by [Ryan Morshead](https://github.com/rmorshea))\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Solution 3: using header cells"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, and I think this is an even better solution, is to use header cells.\n",
+ "
\n",
+ "
\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To define the hyperlink anchor tag to this \"header cell\" is just the text content of the \"header cell\" connected by dashes. E.g.,\n",
+ "\n",
+ "`[link to another section](#Another-section)`\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[Click this link and jump to the top of the page](#top)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can't see it, but this cell contains a \n",
+ "`` \n",
+ "anchor tag just below this text.\n",
+ ""
+ ]
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/tutorials/things_in_pandas.ipynb b/tutorials/things_in_pandas.ipynb
new file mode 100644
index 0000000..968d734
--- /dev/null
+++ b/tutorials/things_in_pandas.ipynb
@@ -0,0 +1,3201 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[Back to the GitHub repository](https://github.com/rasbt/python_reference)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sebastian Raschka 28/01/2015 \n",
+ "\n",
+ "CPython 3.4.2\n",
+ "IPython 2.3.1\n",
+ "\n",
+ "pandas 0.15.2\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext watermark\n",
+ "%watermark -a 'Sebastian Raschka' -v -d -p pandas"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[More information](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `watermark` magic command extension."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Things in Pandas I Wish I'd Known Earlier"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is just a small but growing collection of pandas snippets that I find occasionally and particularly useful -- consider it as my personal notebook. Suggestions, tips, and contributions are very, very welcome!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sections"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- [Loading Some Example Data](#Loading-Some-Example-Data)\n",
+ "- [Renaming Columns](#Renaming-Columns)\n",
+ " - [Converting Column Names to Lowercase](#Converting-Column-Names-to-Lowercase)\n",
+ " - [Renaming Particular Columns](#Renaming-Particular-Columns)\n",
+ "- [Applying Computations Rows-wise](#Applying-Computations-Rows-wise)\n",
+ " - [Changing Values in a Column](#Changing-Values-in-a-Column)\n",
+ " - [Adding a New Column](#Adding-a-New-Column)\n",
+ " - [Applying Functions to Multiple Columns](#Applying-Functions-to-Multiple-Columns)\n",
+ "- [Missing Values aka NaNs](#Missing-Values-aka-NaNs)\n",
+ " - [Counting Rows with NaNs](#Counting-Rows-with-NaNs)\n",
+ " - [Selecting NaN Rows](#Selecting-NaN-Rows)\n",
+ " - [Selecting non-NaN Rows](#Selecting-non-NaN-Rows)\n",
+ " - [Filling NaN Rows](#Filling-NaN-Rows)\n",
+ "- [Appending Rows to a DataFrame](#Appending-Rows-to-a-DataFrame)\n",
+ "- [Sorting and Reindexing DataFrames](#Sorting-and-Reindexing-DataFrames)\n",
+ "- [Updating Columns](#Updating-Columns)\n",
+ "- [Chaining Conditions - Using Bitwise Operators](#Chaining-Conditions---Using-Bitwise-Operators)\n",
+ "- [Column Types](#Column-Types)\n",
+ " - [Printing Column Types](#Printing-Column-Types)\n",
+ " - [Selecting by Column Type](#Selecting-by-Column-Type)\n",
+ " - [Converting Column Types](#Converting-Column-Types)\n",
+ "- [If-tests](#If-tests)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Loading Some Example Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "I am heavily into sports prediction (via a machine learning approach) these days. So, let us use a (very) small subset of the soccer data that I am just working with."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " PLAYER \n",
+ " SALARY \n",
+ " GP \n",
+ " G \n",
+ " A \n",
+ " SOT \n",
+ " PPG \n",
+ " P \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Sergio Agüero\\n Forward — Manchester City \n",
+ " $19.2m \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Eden Hazard\\n Midfield — Chelsea \n",
+ " $18.9m \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Alexis Sánchez\\n Forward — Arsenal \n",
+ " $17.6m \n",
+ " NaN \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Yaya Touré\\n Midfield — Manchester City \n",
+ " $16.6m \n",
+ " 18 \n",
+ " 7 \n",
+ " 1 \n",
+ " 19 \n",
+ " 10.99 \n",
+ " 197.91 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Ángel Di María\\n Midfield — Manchester United \n",
+ " $15.0m \n",
+ " 13 \n",
+ " 3 \n",
+ " NaN \n",
+ " 13 \n",
+ " 10.17 \n",
+ " 132.23 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Santiago Cazorla\\n Midfield — Arsenal \n",
+ " $14.8m \n",
+ " 20 \n",
+ " 4 \n",
+ " NaN \n",
+ " 20 \n",
+ " 9.97 \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " David Silva\\n Midfield — Manchester City \n",
+ " $14.3m \n",
+ " 15 \n",
+ " 6 \n",
+ " 2 \n",
+ " 11 \n",
+ " 10.35 \n",
+ " 155.26 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Cesc Fàbregas\\n Midfield — Chelsea \n",
+ " $14.0m \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Saido Berahino\\n Forward — West Brom \n",
+ " $13.8m \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Steven Gerrard\\n Midfield — Liverpool \n",
+ " $13.8m \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " PLAYER SALARY GP G A SOT \\\n",
+ "0 Sergio Agüero\\n Forward — Manchester City $19.2m 16 14 3 34 \n",
+ "1 Eden Hazard\\n Midfield — Chelsea $18.9m 21 8 4 17 \n",
+ "2 Alexis Sánchez\\n Forward — Arsenal $17.6m NaN 12 7 29 \n",
+ "3 Yaya Touré\\n Midfield — Manchester City $16.6m 18 7 1 19 \n",
+ "4 Ángel Di María\\n Midfield — Manchester United $15.0m 13 3 NaN 13 \n",
+ "5 Santiago Cazorla\\n Midfield — Arsenal $14.8m 20 4 NaN 20 \n",
+ "6 David Silva\\n Midfield — Manchester City $14.3m 15 6 2 11 \n",
+ "7 Cesc Fàbregas\\n Midfield — Chelsea $14.0m 20 2 14 10 \n",
+ "8 Saido Berahino\\n Forward — West Brom $13.8m 21 9 0 20 \n",
+ "9 Steven Gerrard\\n Midfield — Liverpool $13.8m 20 5 1 11 \n",
+ "\n",
+ " PPG P \n",
+ "0 13.12 209.98 \n",
+ "1 13.05 274.04 \n",
+ "2 11.19 223.86 \n",
+ "3 10.99 197.91 \n",
+ "4 10.17 132.23 \n",
+ "5 9.97 NaN \n",
+ "6 10.35 155.26 \n",
+ "7 10.47 209.49 \n",
+ "8 7.02 147.43 \n",
+ "9 7.50 150.01 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Renaming Columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Converting Column Names to Lowercase"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " gp \n",
+ " g \n",
+ " a \n",
+ " sot \n",
+ " ppg \n",
+ " p \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Cesc Fàbregas\\n Midfield — Chelsea \n",
+ " $14.0m \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Saido Berahino\\n Forward — West Brom \n",
+ " $13.8m \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Steven Gerrard\\n Midfield — Liverpool \n",
+ " $13.8m \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary gp g a sot ppg \\\n",
+ "7 Cesc Fàbregas\\n Midfield — Chelsea $14.0m 20 2 14 10 10.47 \n",
+ "8 Saido Berahino\\n Forward — West Brom $13.8m 21 9 0 20 7.02 \n",
+ "9 Steven Gerrard\\n Midfield — Liverpool $13.8m 20 5 1 11 7.50 \n",
+ "\n",
+ " p \n",
+ "7 209.49 \n",
+ "8 147.43 \n",
+ "9 150.01 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Converting column names to lowercase\n",
+ "\n",
+ "df.columns = [c.lower() for c in df.columns]\n",
+ "\n",
+ "# or\n",
+ "# df.rename(columns=lambda x : x.lower())\n",
+ "\n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Renaming Particular Columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Cesc Fàbregas\\n Midfield — Chelsea \n",
+ " $14.0m \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Saido Berahino\\n Forward — West Brom \n",
+ " $13.8m \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Steven Gerrard\\n Midfield — Liverpool \n",
+ " $13.8m \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists \\\n",
+ "7 Cesc Fàbregas\\n Midfield — Chelsea $14.0m 20 2 14 \n",
+ "8 Saido Berahino\\n Forward — West Brom $13.8m 21 9 0 \n",
+ "9 Steven Gerrard\\n Midfield — Liverpool $13.8m 20 5 1 \n",
+ "\n",
+ " shots_on_target points_per_game points \n",
+ "7 10 10.47 209.49 \n",
+ "8 20 7.02 147.43 \n",
+ "9 11 7.50 150.01 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df.rename(columns={'p': 'points', \n",
+ " 'gp': 'games',\n",
+ " 'sot': 'shots_on_target',\n",
+ " 'g': 'goals',\n",
+ " 'ppg': 'points_per_game',\n",
+ " 'a': 'assists',})\n",
+ "\n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Applying Computations Rows-wise"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Changing Values in a Column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Santiago Cazorla\\n Midfield — Arsenal \n",
+ " 14.8 \n",
+ " 20 \n",
+ " 4 \n",
+ " NaN \n",
+ " 20 \n",
+ " 9.97 \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " David Silva\\n Midfield — Manchester City \n",
+ " 14.3 \n",
+ " 15 \n",
+ " 6 \n",
+ " 2 \n",
+ " 11 \n",
+ " 10.35 \n",
+ " 155.26 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Cesc Fàbregas\\n Midfield — Chelsea \n",
+ " 14.0 \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Saido Berahino\\n Forward — West Brom \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Steven Gerrard\\n Midfield — Liverpool \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists \\\n",
+ "5 Santiago Cazorla\\n Midfield — Arsenal 14.8 20 4 NaN \n",
+ "6 David Silva\\n Midfield — Manchester City 14.3 15 6 2 \n",
+ "7 Cesc Fàbregas\\n Midfield — Chelsea 14.0 20 2 14 \n",
+ "8 Saido Berahino\\n Forward — West Brom 13.8 21 9 0 \n",
+ "9 Steven Gerrard\\n Midfield — Liverpool 13.8 20 5 1 \n",
+ "\n",
+ " shots_on_target points_per_game points \n",
+ "5 20 9.97 NaN \n",
+ "6 11 10.35 155.26 \n",
+ "7 10 10.47 209.49 \n",
+ "8 20 7.02 147.43 \n",
+ "9 11 7.50 150.01 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Processing `salary` column\n",
+ "\n",
+ "df['salary'] = df['salary'].apply(lambda x: x.strip('$m'))\n",
+ "df.tail()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Adding a New Column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Cesc Fàbregas\\n Midfield — Chelsea \n",
+ " 14.0 \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Saido Berahino\\n Forward — West Brom \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Steven Gerrard\\n Midfield — Liverpool \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists \\\n",
+ "7 Cesc Fàbregas\\n Midfield — Chelsea 14.0 20 2 14 \n",
+ "8 Saido Berahino\\n Forward — West Brom 13.8 21 9 0 \n",
+ "9 Steven Gerrard\\n Midfield — Liverpool 13.8 20 5 1 \n",
+ "\n",
+ " shots_on_target points_per_game points position team \n",
+ "7 10 10.47 209.49 \n",
+ "8 20 7.02 147.43 \n",
+ "9 11 7.50 150.01 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['team'] = pd.Series('', index=df.index)\n",
+ "\n",
+ "# or\n",
+ "df.insert(loc=8, column='position', value='') \n",
+ "\n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Cesc Fàbregas \n",
+ " 14.0 \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " Midfield \n",
+ " Chelsea \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Saido Berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " Forward \n",
+ " West Brom \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Steven Gerrard \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " Midfield \n",
+ " Liverpool \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "7 Cesc Fàbregas 14.0 20 2 14 10 \n",
+ "8 Saido Berahino 13.8 21 9 0 20 \n",
+ "9 Steven Gerrard 13.8 20 5 1 11 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "7 10.47 209.49 Midfield Chelsea \n",
+ "8 7.02 147.43 Forward West Brom \n",
+ "9 7.50 150.01 Midfield Liverpool "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Processing `player` column\n",
+ "\n",
+ "def process_player_col(text):\n",
+ " name, rest = text.split('\\n')\n",
+ " position, team = [x.strip() for x in rest.split(' — ')]\n",
+ " return pd.Series([name, team, position])\n",
+ "\n",
+ "df[['player', 'team', 'position']] = df.player.apply(process_player_col)\n",
+ "\n",
+ "# modified after tip from reddit.com/user/hharison\n",
+ "#\n",
+ "# Alternative (inferior) approach:\n",
+ "#\n",
+ "#for idx,row in df.iterrows():\n",
+ "# name, position, team = process_player_col(row['player'])\n",
+ "# df.ix[idx, 'player'], df.ix[idx, 'position'], df.ix[idx, 'team'] = name, position, team\n",
+ " \n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Applying Functions to Multiple Columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " sergio agüero \n",
+ " 19.2 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " alexis sánchez \n",
+ " 17.6 \n",
+ " NaN \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " yaya touré \n",
+ " 16.6 \n",
+ " 18 \n",
+ " 7 \n",
+ " 1 \n",
+ " 19 \n",
+ " 10.99 \n",
+ " 197.91 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " ángel di maría \n",
+ " 15.0 \n",
+ " 13 \n",
+ " 3 \n",
+ " NaN \n",
+ " 13 \n",
+ " 10.17 \n",
+ " 132.23 \n",
+ " midfield \n",
+ " manchester united \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "0 sergio agüero 19.2 16 14 3 34 \n",
+ "1 eden hazard 18.9 21 8 4 17 \n",
+ "2 alexis sánchez 17.6 NaN 12 7 29 \n",
+ "3 yaya touré 16.6 18 7 1 19 \n",
+ "4 ángel di maría 15.0 13 3 NaN 13 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "0 13.12 209.98 forward manchester city \n",
+ "1 13.05 274.04 midfield chelsea \n",
+ "2 11.19 223.86 forward arsenal \n",
+ "3 10.99 197.91 midfield manchester city \n",
+ "4 10.17 132.23 midfield manchester united "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cols = ['player', 'position', 'team']\n",
+ "df[cols] = df[cols].applymap(lambda x: x.lower())\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Missing Values aka NaNs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Counting Rows with NaNs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3 rows have missing values\n"
+ ]
+ }
+ ],
+ "source": [
+ "nans = df.shape[0] - df.dropna().shape[0]\n",
+ "\n",
+ "print('%d rows have missing values' % nans)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Selecting NaN Rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " ángel di maría \n",
+ " 15.0 \n",
+ " 13 \n",
+ " 3 \n",
+ " NaN \n",
+ " 13 \n",
+ " 10.17 \n",
+ " 132.23 \n",
+ " midfield \n",
+ " manchester united \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " santiago cazorla \n",
+ " 14.8 \n",
+ " 20 \n",
+ " 4 \n",
+ " NaN \n",
+ " 20 \n",
+ " 9.97 \n",
+ " NaN \n",
+ " midfield \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "4 ángel di maría 15.0 13 3 NaN 13 \n",
+ "5 santiago cazorla 14.8 20 4 NaN 20 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "4 10.17 132.23 midfield manchester united \n",
+ "5 9.97 NaN midfield arsenal "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Selecting all rows that have NaNs in the `assists` column\n",
+ "\n",
+ "df[df['assists'].isnull()]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Selecting non-NaN Rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " sergio agüero \n",
+ " 19.2 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " alexis sánchez \n",
+ " 17.6 \n",
+ " NaN \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " yaya touré \n",
+ " 16.6 \n",
+ " 18 \n",
+ " 7 \n",
+ " 1 \n",
+ " 19 \n",
+ " 10.99 \n",
+ " 197.91 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " david silva \n",
+ " 14.3 \n",
+ " 15 \n",
+ " 6 \n",
+ " 2 \n",
+ " 11 \n",
+ " 10.35 \n",
+ " 155.26 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " cesc fàbregas \n",
+ " 14.0 \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " steven gerrard \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " midfield \n",
+ " liverpool \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "0 sergio agüero 19.2 16 14 3 34 \n",
+ "1 eden hazard 18.9 21 8 4 17 \n",
+ "2 alexis sánchez 17.6 NaN 12 7 29 \n",
+ "3 yaya touré 16.6 18 7 1 19 \n",
+ "6 david silva 14.3 15 6 2 11 \n",
+ "7 cesc fàbregas 14.0 20 2 14 10 \n",
+ "8 saido berahino 13.8 21 9 0 20 \n",
+ "9 steven gerrard 13.8 20 5 1 11 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "0 13.12 209.98 forward manchester city \n",
+ "1 13.05 274.04 midfield chelsea \n",
+ "2 11.19 223.86 forward arsenal \n",
+ "3 10.99 197.91 midfield manchester city \n",
+ "6 10.35 155.26 midfield manchester city \n",
+ "7 10.47 209.49 midfield chelsea \n",
+ "8 7.02 147.43 forward west brom \n",
+ "9 7.50 150.01 midfield liverpool "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df['assists'].notnull()]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Filling NaN Rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " sergio agüero \n",
+ " 19.2 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " alexis sánchez \n",
+ " 17.6 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " yaya touré \n",
+ " 16.6 \n",
+ " 18 \n",
+ " 7 \n",
+ " 1 \n",
+ " 19 \n",
+ " 10.99 \n",
+ " 197.91 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " ángel di maría \n",
+ " 15.0 \n",
+ " 13 \n",
+ " 3 \n",
+ " 0 \n",
+ " 13 \n",
+ " 10.17 \n",
+ " 132.23 \n",
+ " midfield \n",
+ " manchester united \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " santiago cazorla \n",
+ " 14.8 \n",
+ " 20 \n",
+ " 4 \n",
+ " 0 \n",
+ " 20 \n",
+ " 9.97 \n",
+ " 0.00 \n",
+ " midfield \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " david silva \n",
+ " 14.3 \n",
+ " 15 \n",
+ " 6 \n",
+ " 2 \n",
+ " 11 \n",
+ " 10.35 \n",
+ " 155.26 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " cesc fàbregas \n",
+ " 14.0 \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " steven gerrard \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " midfield \n",
+ " liverpool \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "0 sergio agüero 19.2 16 14 3 34 \n",
+ "1 eden hazard 18.9 21 8 4 17 \n",
+ "2 alexis sánchez 17.6 0 12 7 29 \n",
+ "3 yaya touré 16.6 18 7 1 19 \n",
+ "4 ángel di maría 15.0 13 3 0 13 \n",
+ "5 santiago cazorla 14.8 20 4 0 20 \n",
+ "6 david silva 14.3 15 6 2 11 \n",
+ "7 cesc fàbregas 14.0 20 2 14 10 \n",
+ "8 saido berahino 13.8 21 9 0 20 \n",
+ "9 steven gerrard 13.8 20 5 1 11 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "0 13.12 209.98 forward manchester city \n",
+ "1 13.05 274.04 midfield chelsea \n",
+ "2 11.19 223.86 forward arsenal \n",
+ "3 10.99 197.91 midfield manchester city \n",
+ "4 10.17 132.23 midfield manchester united \n",
+ "5 9.97 0.00 midfield arsenal \n",
+ "6 10.35 155.26 midfield manchester city \n",
+ "7 10.47 209.49 midfield chelsea \n",
+ "8 7.02 147.43 forward west brom \n",
+ "9 7.50 150.01 midfield liverpool "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Filling NaN cells with default value 0\n",
+ "\n",
+ "df.fillna(value=0, inplace=True)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Appending Rows to a DataFrame"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " steven gerrard \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " midfield \n",
+ " liverpool \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "8 saido berahino 13.8 21 9 0 20 \n",
+ "9 steven gerrard 13.8 20 5 1 11 \n",
+ "10 NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " points_per_game points position team \n",
+ "8 7.02 147.43 forward west brom \n",
+ "9 7.50 150.01 midfield liverpool \n",
+ "10 NaN NaN NaN NaN "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Adding an \"empty\" row to the DataFrame\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "df = df.append(pd.Series(\n",
+ " [np.nan]*len(df.columns), # Fill cells with NaNs\n",
+ " index=df.columns), \n",
+ " ignore_index=True)\n",
+ "\n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " steven gerrard \n",
+ " 13.8 \n",
+ " 20 \n",
+ " 5 \n",
+ " 1 \n",
+ " 11 \n",
+ " 7.50 \n",
+ " 150.01 \n",
+ " midfield \n",
+ " liverpool \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " new player \n",
+ " 12.3 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "8 saido berahino 13.8 21 9 0 20 \n",
+ "9 steven gerrard 13.8 20 5 1 11 \n",
+ "10 new player 12.3 NaN NaN NaN NaN \n",
+ "\n",
+ " points_per_game points position team \n",
+ "8 7.02 147.43 forward west brom \n",
+ "9 7.50 150.01 midfield liverpool \n",
+ "10 NaN NaN NaN NaN "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Filling cells with data\n",
+ "\n",
+ "df.loc[df.index[-1], 'player'] = 'new player'\n",
+ "df.loc[df.index[-1], 'salary'] = 12.3\n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sorting and Reindexing DataFrames"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " sergio agüero \n",
+ " 19.2 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " alexis sánchez \n",
+ " 17.6 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " yaya touré \n",
+ " 16.6 \n",
+ " 18 \n",
+ " 7 \n",
+ " 1 \n",
+ " 19 \n",
+ " 10.99 \n",
+ " 197.91 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "0 sergio agüero 19.2 16 14 3 34 \n",
+ "2 alexis sánchez 17.6 0 12 7 29 \n",
+ "8 saido berahino 13.8 21 9 0 20 \n",
+ "1 eden hazard 18.9 21 8 4 17 \n",
+ "3 yaya touré 16.6 18 7 1 19 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "0 13.12 209.98 forward manchester city \n",
+ "2 11.19 223.86 forward arsenal \n",
+ "8 7.02 147.43 forward west brom \n",
+ "1 13.05 274.04 midfield chelsea \n",
+ "3 10.99 197.91 midfield manchester city "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sorting the DataFrame by a certain column (from highest to lowest)\n",
+ "\n",
+ "df.sort('goals', ascending=False, inplace=True)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " sergio agüero \n",
+ " 19.2 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " alexis sánchez \n",
+ " 17.6 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " yaya touré \n",
+ " 16.6 \n",
+ " 18 \n",
+ " 7 \n",
+ " 1 \n",
+ " 19 \n",
+ " 10.99 \n",
+ " 197.91 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "1 sergio agüero 19.2 16 14 3 34 \n",
+ "2 alexis sánchez 17.6 0 12 7 29 \n",
+ "3 saido berahino 13.8 21 9 0 20 \n",
+ "4 eden hazard 18.9 21 8 4 17 \n",
+ "5 yaya touré 16.6 18 7 1 19 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "1 13.12 209.98 forward manchester city \n",
+ "2 11.19 223.86 forward arsenal \n",
+ "3 7.02 147.43 forward west brom \n",
+ "4 13.05 274.04 midfield chelsea \n",
+ "5 10.99 197.91 midfield manchester city "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Optional reindexing of the DataFrame after sorting\n",
+ "\n",
+ "df.index = range(1,len(df.index)+1)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Updating Columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " sergio agüero \n",
+ " 20 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " alexis sánchez \n",
+ " 15 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "1 sergio agüero 20 16 14 3 34 \n",
+ "2 alexis sánchez 15 0 12 7 29 \n",
+ "3 saido berahino 13.8 21 9 0 20 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "1 13.12 209.98 forward manchester city \n",
+ "2 11.19 223.86 forward arsenal \n",
+ "3 7.02 147.43 forward west brom "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Creating a dummy DataFrame with changes in the `salary` column\n",
+ "\n",
+ "df_2 = df.copy()\n",
+ "df_2.loc[0:2, 'salary'] = [20.0, 15.0]\n",
+ "df_2.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " player \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " sergio agüero \n",
+ " 19.2 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " alexis sánchez \n",
+ " 17.6 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " salary games goals assists shots_on_target \\\n",
+ "player \n",
+ "sergio agüero 19.2 16 14 3 34 \n",
+ "alexis sánchez 17.6 0 12 7 29 \n",
+ "saido berahino 13.8 21 9 0 20 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "player \n",
+ "sergio agüero 13.12 209.98 forward manchester city \n",
+ "alexis sánchez 11.19 223.86 forward arsenal \n",
+ "saido berahino 7.02 147.43 forward west brom "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Temporarily use the `player` columns as indices to \n",
+ "# apply the update functions\n",
+ "\n",
+ "df.set_index('player', inplace=True)\n",
+ "df_2.set_index('player', inplace=True)\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " player \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " sergio agüero \n",
+ " 20 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " alexis sánchez \n",
+ " 15 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " salary games goals assists shots_on_target \\\n",
+ "player \n",
+ "sergio agüero 20 16 14 3 34 \n",
+ "alexis sánchez 15 0 12 7 29 \n",
+ "saido berahino 13.8 21 9 0 20 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "player \n",
+ "sergio agüero 13.12 209.98 forward manchester city \n",
+ "alexis sánchez 11.19 223.86 forward arsenal \n",
+ "saido berahino 7.02 147.43 forward west brom "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Update the `salary` column\n",
+ "df.update(other=df_2['salary'], overwrite=True)\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " sergio agüero \n",
+ " 20 \n",
+ " 16 \n",
+ " 14 \n",
+ " 3 \n",
+ " 34 \n",
+ " 13.12 \n",
+ " 209.98 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " alexis sánchez \n",
+ " 15 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " 21 \n",
+ " 9 \n",
+ " 0 \n",
+ " 20 \n",
+ " 7.02 \n",
+ " 147.43 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "0 sergio agüero 20 16 14 3 34 \n",
+ "1 alexis sánchez 15 0 12 7 29 \n",
+ "2 saido berahino 13.8 21 9 0 20 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "0 13.12 209.98 forward manchester city \n",
+ "1 11.19 223.86 forward arsenal \n",
+ "2 7.02 147.43 forward west brom "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Reset the indices\n",
+ "df.reset_index(inplace=True)\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Chaining Conditions - Using Bitwise Operators"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " alexis sánchez \n",
+ " 15 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " 21 \n",
+ " 8 \n",
+ " 4 \n",
+ " 17 \n",
+ " 13.05 \n",
+ " 274.04 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " santiago cazorla \n",
+ " 14.8 \n",
+ " 20 \n",
+ " 4 \n",
+ " 0 \n",
+ " 20 \n",
+ " 9.97 \n",
+ " 0.00 \n",
+ " midfield \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " cesc fàbregas \n",
+ " 14.0 \n",
+ " 20 \n",
+ " 2 \n",
+ " 14 \n",
+ " 10 \n",
+ " 10.47 \n",
+ " 209.49 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "1 alexis sánchez 15 0 12 7 29 \n",
+ "3 eden hazard 18.9 21 8 4 17 \n",
+ "7 santiago cazorla 14.8 20 4 0 20 \n",
+ "9 cesc fàbregas 14.0 20 2 14 10 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "1 11.19 223.86 forward arsenal \n",
+ "3 13.05 274.04 midfield chelsea \n",
+ "7 9.97 0.00 midfield arsenal \n",
+ "9 10.47 209.49 midfield chelsea "
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Selecting only those players that either playing for Arsenal or Chelsea\n",
+ "\n",
+ "df[ (df['team'] == 'arsenal') | (df['team'] == 'chelsea') ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " games \n",
+ " goals \n",
+ " assists \n",
+ " shots_on_target \n",
+ " points_per_game \n",
+ " points \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " alexis sánchez \n",
+ " 15 \n",
+ " 0 \n",
+ " 12 \n",
+ " 7 \n",
+ " 29 \n",
+ " 11.19 \n",
+ " 223.86 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary games goals assists shots_on_target \\\n",
+ "1 alexis sánchez 15 0 12 7 29 \n",
+ "\n",
+ " points_per_game points position team \n",
+ "1 11.19 223.86 forward arsenal "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Selecting forwards from Arsenal only\n",
+ "\n",
+ "df[ (df['team'] == 'arsenal') & (df['position'] == 'forward') ]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Column Types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Printing Column Types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{dtype('float64'): ['games',\n",
+ " 'goals',\n",
+ " 'assists',\n",
+ " 'shots_on_target',\n",
+ " 'points_per_game',\n",
+ " 'points'],\n",
+ " dtype('O'): ['player', 'salary', 'position', 'team']}"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "types = df.columns.to_series().groupby(df.dtypes).groups\n",
+ "types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Selecting by Column Type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " player \n",
+ " salary \n",
+ " position \n",
+ " team \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " sergio agüero \n",
+ " 20 \n",
+ " forward \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " alexis sánchez \n",
+ " 15 \n",
+ " forward \n",
+ " arsenal \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " saido berahino \n",
+ " 13.8 \n",
+ " forward \n",
+ " west brom \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " eden hazard \n",
+ " 18.9 \n",
+ " midfield \n",
+ " chelsea \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " yaya touré \n",
+ " 16.6 \n",
+ " midfield \n",
+ " manchester city \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " player salary position team\n",
+ "0 sergio agüero 20 forward manchester city\n",
+ "1 alexis sánchez 15 forward arsenal\n",
+ "2 saido berahino 13.8 forward west brom\n",
+ "3 eden hazard 18.9 midfield chelsea\n",
+ "4 yaya touré 16.6 midfield manchester city"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# select string columns\n",
+ "df.loc[:, (df.dtypes == np.dtype('O')).values].head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Converting Column Types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['salary'] = df['salary'].astype(float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{dtype('float64'): ['salary',\n",
+ " 'games',\n",
+ " 'goals',\n",
+ " 'assists',\n",
+ " 'shots_on_target',\n",
+ " 'points_per_game',\n",
+ " 'points'],\n",
+ " dtype('O'): ['player', 'position', 'team']}"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "types = df.columns.to_series().groupby(df.dtypes).groups\n",
+ "types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# If-tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to section overview](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "I was recently asked how to do an if-test in pandas, that is, how to create an array of 1s and 0s depending on a condition, e.g., if `val` less than 0.5 -> 0, else -> 1. Using the boolean mask, that's pretty simple since `True` and `False` are integers after all."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "int(True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 2.0 \n",
+ " 0.30 \n",
+ " 4.00 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0.8 \n",
+ " 0.03 \n",
+ " 0.02 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " 0 1 2 3\n",
+ "0 2.0 0.30 4.00 5\n",
+ "1 0.8 0.03 0.02 5"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "a = [[2., .3, 4., 5.], [.8, .03, 0.02, 5.]]\n",
+ "df = pd.DataFrame(a)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " 0 1 2 3\n",
+ "0 False False False False\n",
+ "1 False True True False"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = df <= 0.05\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " 0 1 2 3\n",
+ "0 0 0 0 0\n",
+ "1 0 1 1 0"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.4.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tutorials/useful_regex.ipynb b/tutorials/useful_regex.ipynb
new file mode 100644
index 0000000..24bcf14
--- /dev/null
+++ b/tutorials/useful_regex.ipynb
@@ -0,0 +1,1070 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:237609a5ef934bf65a93a410c9e5107b808049dd04b0faf2b30f9b423699ba6c"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[Sebastian Raschka](http://sebastianraschka.com) \n",
+ "\n",
+ "- [Link to this IPython notebook on Github](https://github.com/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "%load_ext watermark"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 1
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "%watermark -d -v -u -t -z"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "Last updated: 06/07/2014 22:50:23 EDT\n",
+ "\n",
+ "CPython 3.4.1\n",
+ "IPython 2.1.0\n"
+ ]
+ }
+ ],
+ "prompt_number": 2
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[More information](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `watermark` magic command extension."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "I would be happy to hear your comments and suggestions. \n",
+ "Please feel free to drop me a note via\n",
+ "[twitter](https://twitter.com/rasbt), [email](mailto:bluewoodtree@gmail.com), or [google+](https://plus.google.com/+SebastianRaschka).\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 1,
+ "metadata": {},
+ "source": [
+ "A collection of useful regular expressions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Sections"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- [About the `re` module](#About-the-re-module)\n",
+ "- [Identify files via file extensions](#Identify-files-via-file-extensions)\n",
+ "- [Username validation](#Username-validation)\n",
+ "- [Checking for valid email addresses](#Checking-for-valid-email-addresses)\n",
+ "- [Check for a valid URL](#Check-for-a-valid-URL)\n",
+ "- [Checking for numbers](#Checking-for-numbers)\n",
+ "- [Validating dates](#Validating-dates)\n",
+ "- [Time](#Time)\n",
+ "- [Checking for HTML tags](#Checking-for-HTML-tags)\n",
+ "- [Checking for IP addresses](#Checking-for-IP-addresses)\n",
+ "- [Checking for MAC addresses](#Checking-for-MAC-addresses)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "About the `re` module"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The purpose of this IPython notebook is not to rewrite a detailed tutorial about regular expressions or the in-built Python `re` module, but to collect some useful regular expressions for copy&paste purposes."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The complete documentation of the Python `re` module can be found here [https://docs.python.org/3.4/howto/regex.html](https://docs.python.org/3.4/howto/regex.html). Below, I just want to list the most important methods for convenience:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- `re.match()` : Determine if the RE matches at the beginning of the string.\n",
+ "- `re.search()` : Scan through a string, looking for any location where this RE matches.\n",
+ "- `re.findall()` : Find all substrings where the RE matches, and returns them as a list.\n",
+ "- `re.finditer()` : Find all substrings where the RE matches, and returns them as an iterator."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you are using the same regular expression multiple times, it is recommended to compile it for improved performance.\n",
+ "\n",
+ " compiled_re = re.compile(r'some_regexpr') \n",
+ " for word in text:\n",
+ " match = comp.search(compiled_re))\n",
+ " # do something with the match\n",
+ " \n",
+ "**E.g., if we want to check if a string ends with a substring:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import re\n",
+ "\n",
+ "needle = 'needlers'\n",
+ "\n",
+ "# Python approach\n",
+ "print(bool(any([needle.endswith(e) for e in ('ly', 'ed', 'ing', 'ers')])))\n",
+ "\n",
+ "# On-the-fly Regular expression in Python\n",
+ "print(bool(re.search(r'(?:ly|ed|ing|ers)$', needle)))\n",
+ "\n",
+ "# Compiled Regular expression in Python\n",
+ "comp = re.compile(r'(?:ly|ed|ing|ers)$') \n",
+ "print(bool(comp.search(needle)))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "True\n",
+ "True\n",
+ "True\n"
+ ]
+ }
+ ],
+ "prompt_number": 3
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "%timeit -n 10000 -r 50 bool(any([needle.endswith(e) for e in ('ly', 'ed', 'ing', 'ers')]))\n",
+ "%timeit -n 10000 -r 50 bool(re.search(r'(?:ly|ed|ing|ers)$', needle))\n",
+ "%timeit -n 10000 -r 50 bool(comp.search(needle))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "10000 loops, best of 50: 2.74 \u00b5s per loop\n",
+ "10000 loops, best of 50: 2.93 \u00b5s per loop"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "10000 loops, best of 50: 1.28 \u00b5s per loop"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "prompt_number": 4
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Identify files via file extensions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A regular expression to check for file extensions. \n",
+ "\n",
+ "Note: This approach is not recommended for thorough limitation of file types (parse the file header instead). However, this regex is still a useful alternative to e.g., a Python's `endswith` approach for quick pre-filtering for certain files of interest."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r'(?i)(\\w+)\\.(jpeg|jpg|png|gif|tif|svg)$'\n",
+ "\n",
+ "# remove `(?i)` to make regexpr case-sensitive\n",
+ "\n",
+ "str_true = ('test.gif', \n",
+ " 'image.jpeg', \n",
+ " 'image.jpg',\n",
+ " 'image.TIF'\n",
+ " )\n",
+ "\n",
+ "str_false = ('test.pdf',\n",
+ " 'test.gif.pdf',\n",
+ " )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 5
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Username validation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Checking for a valid user name that has a certain minimum and maximum length.\n",
+ "\n",
+ "Allowed characters:\n",
+ "- letters (upper- and lower-case)\n",
+ "- numbers\n",
+ "- dashes\n",
+ "- underscores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "min_len = 5 # minimum length for a valid username\n",
+ "max_len = 15 # maximum length for a valid username\n",
+ "\n",
+ "pattern = r\"^(?i)[a-z0-9_-]{%s,%s}$\" %(min_len, max_len)\n",
+ "\n",
+ "# remove `(?i)` to only allow lower-case letters\n",
+ "\n",
+ "\n",
+ "\n",
+ "str_true = ('user123', '123_user', 'Username')\n",
+ " \n",
+ "str_false = ('user', 'username1234_is-way-too-long', 'user$34354')\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 6
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Checking for valid email addresses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A regular expression that captures most email addresses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r\"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$)\"\n",
+ "\n",
+ "str_true = ('test@mail.com',)\n",
+ " \n",
+ "str_false = ('testmail.com', '@testmail.com', 'test@mailcom')\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 7
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "source: [http://stackoverflow.com/questions/201323/using-a-regular-expression-to-validate-an-email-address](http://stackoverflow.com/questions/201323/using-a-regular-expression-to-validate-an-email-address)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Check for a valid URL"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Checks for an URL if a string ...\n",
+ "\n",
+ "- starts with `https://`, or `http://`, or `www.`\n",
+ "- or ends with a dot extension"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$'\n",
+ "\n",
+ "str_true = ('https://github.com', \n",
+ " 'http://github.com',\n",
+ " 'www.github.com',\n",
+ " 'github.com',\n",
+ " 'test.de',\n",
+ " 'https://github.com/rasbt',\n",
+ " 'test.jpeg' # !!! \n",
+ " )\n",
+ " \n",
+ "str_false = ('testmailcom', 'http:testmailcom', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 8
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "source: [http://code.tutsplus.com/tutorials/8-regular-expressions-you-should-know--net-6149](http://code.tutsplus.com/tutorials/8-regular-expressions-you-should-know--net-6149)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Checking for numbers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Positive integers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^\\d+$'\n",
+ "\n",
+ "str_true = ('123', '1', )\n",
+ " \n",
+ "str_false = ('abc', '1.1', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 9
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Negative integers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^-\\d+$'\n",
+ "\n",
+ "str_true = ('-123', '-1', )\n",
+ " \n",
+ "str_false = ('123', '-abc', '-1.1', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 10
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "All integers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^-{0,1}\\d+$'\n",
+ "\n",
+ "str_true = ('-123', '-1', '1', '123',)\n",
+ " \n",
+ "str_false = ('123.0', '-abc', '-1.1', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 11
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Positive numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^\\d*\\.{0,1}\\d+$'\n",
+ "\n",
+ "str_true = ('1', '123', '1.234', )\n",
+ " \n",
+ "str_false = ('-abc', '-123', '-123.0')\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 12
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Negative numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^-\\d*\\.{0,1}\\d+$'\n",
+ "\n",
+ "str_true = ('-1', '-123', '-123.0', )\n",
+ " \n",
+ "str_false = ('-abc', '1', '123', '1.234', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 13
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "All numbers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^-{0,1}\\d*\\.{0,1}\\d+$'\n",
+ "\n",
+ "str_true = ('1', '123', '1.234', '-123', '-123.0')\n",
+ " \n",
+ "str_false = ('-abc')\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 14
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "source: [http://stackoverflow.com/questions/1449817/what-are-some-of-the-most-useful-regular-expressions-for-programmers](http://stackoverflow.com/questions/1449817/what-are-some-of-the-most-useful-regular-expressions-for-programmers)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Validating dates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Validates dates in `mm/dd/yyyy` format."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = '^(0[1-9]|1[0-2])\\/(0[1-9]|1\\d|2\\d|3[01])\\/(19|20)\\d{2}$'\n",
+ "\n",
+ "str_true = ('01/08/2014', '12/30/2014', )\n",
+ " \n",
+ "str_false = ('22/08/2014', '-123', '1/8/2014', '1/08/2014', '01/8/2014')\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 15
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "12-Hour format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r'^(1[012]|[1-9]):[0-5][0-9](\\s)?(?i)(am|pm)$'\n",
+ "\n",
+ "str_true = ('2:00pm', '7:30 AM', '12:05 am', )\n",
+ " \n",
+ "str_false = ('22:00pm', '14:00', '3:12', '03:12pm', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 29
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "24-Hour format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r'^([0-1]{1}[0-9]{1}|20|21|22|23):[0-5]{1}[0-9]{1}$'\n",
+ "\n",
+ "str_true = ('14:00', '00:30', )\n",
+ " \n",
+ "str_false = ('22:00pm', '4:00', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 18
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Checking for HTML tags"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Also this regex is only recommended for \"filtering\" purposes and not a ultimate way to parse HTML. For more information see this excellent discussion on StackOverflow: \n",
+ "[http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/](http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r\"\"\"?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>\"\"\"\n",
+ "\n",
+ "str_true = ('', '', '', '
')\n",
+ " \n",
+ "str_false = ('a>', '')\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 16
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "source: [http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/](http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Checking for IP addresses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "IPv4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Image source: http://en.wikipedia.org/wiki/File:Ipv4_address.svg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'\n",
+ "\n",
+ "str_true = ('172.16.254.1', '1.2.3.4', '01.102.103.104', )\n",
+ " \n",
+ "str_false = ('17216.254.1', '1.2.3.4.5', '01 .102.103.104', )\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 8
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "source: [http://answers.oreilly.com/topic/318-how-to-match-ipv4-addresses-with-regular-expressions/](http://answers.oreilly.com/topic/318-how-to-match-ipv4-addresses-with-regular-expressions/)"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Ipv6"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Image source: http://upload.wikimedia.org/wikipedia/commons/1/15/Ipv6_address.svg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r'^\\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?\\s*$'\n",
+ "\n",
+ "str_true = ('2001:470:9b36:1::2',\n",
+ " '2001:cdba:0000:0000:0000:0000:3257:9652', \n",
+ " '2001:cdba:0:0:0:0:3257:9652', \n",
+ " '2001:cdba::3257:9652', )\n",
+ " \n",
+ "str_false = ('1200::AB00:1234::2552:7777:1313', # uses `::` twice\n",
+ " '1200:0000:AB00:1234:O000:2552:7777:1313', ) # contains an O instead of 0\n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 21
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "source: [http://snipplr.com/view/43003/regex--match-ipv6-address/](http://snipplr.com/view/43003/regex--match-ipv6-address/)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Checking for MAC addresses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[[back to top](#Sections)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Image source: http://upload.wikimedia.org/wikipedia/en/3/37/MACaddressV3.png "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pattern = r'^(?i)([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$'\n",
+ "\n",
+ "str_true = ('94-AE-70-A0-66-83', \n",
+ " '58-f8-1a-00-44-c8',\n",
+ " '00:A0:C9:14:C8:29'\n",
+ " , )\n",
+ " \n",
+ "str_false = ('0:00:00:00:00:00', \n",
+ " '94-AE-70-A0 -66-83', ) \n",
+ "\n",
+ "for t in str_true:\n",
+ " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+ "\n",
+ "for f in str_false:\n",
+ " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 29
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/useful_scripts/combinations.py b/useful_scripts/combinations.py
new file mode 100755
index 0000000..5dbe91d
--- /dev/null
+++ b/useful_scripts/combinations.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+# Sebastian Raschka 2014
+# Functions to calculate factorial, combinations, and permutations
+# bundled in a simple command line interface.
+
def factorial(n):
    """Return n! (the factorial of n) for a non-negative integer n.

    Parameters:
        n (int): non-negative integer.

    Returns:
        int: n! (1 for n == 0).

    Raises:
        ValueError: if n is negative.  The original recursive version
            recursed forever on negative input until the interpreter's
            recursion limit was hit; the iterative form below also avoids
            RecursionError for large n.
    """
    if n < 0:
        raise ValueError('factorial() is not defined for negative values')
    result = 1
    for k in range(2, n + 1):
        result *= k
    return result
+
def combinations(n, r):
    """Return "n choose r": the number of ways to pick r of n items
    where order does not matter.

    Uses exact integer arithmetic (// and math.factorial).  The original
    `int(numerator/denominator)` performed float division, which loses
    precision for moderately large n and raises OverflowError once the
    factorials exceed float range.
    """
    import math  # local import keeps the function a drop-in for the script
    return math.factorial(n) // (math.factorial(r) * math.factorial(n - r))
+
def permutations(n, r):
    """Return the number of ordered arrangements of r out of n items.

    Uses exact integer arithmetic (// and math.factorial).  The original
    `int(numerator/denominator)` performed float division, which loses
    precision for moderately large n and raises OverflowError once the
    factorials exceed float range.
    """
    import math  # local import keeps the function a drop-in for the script
    return math.factorial(n) // math.factorial(n - r)
+
+# Lightweight self-tests executed at import time.
+assert(factorial(3) == 6)
+assert(combinations(20, 8) == 125970)
+assert(permutations(30, 3) == 24360)
+
+
+
+
if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser(
        description='Script to calculate the number of combinations or permutations ("n choose r")',
        formatter_class=argparse.RawTextHelpFormatter,
        prog='Combinations',
        epilog='Example: ./combinations.py -c 20 3'
        )

    parser.add_argument('-c', '--combinations', type=int, metavar='NUMBER', nargs=2,
            help='Combinations: Number of ways to combine n items with sequence length r where the item order does not matter.')

    # NOTE: for permutations the order DOES matter -- the original help text
    # was a copy-paste of the combinations text and said the opposite.
    parser.add_argument('-p', '--permutations', type=int, metavar='NUMBER', nargs=2,
            help='Permutations: Number of ways to combine n items with sequence length r where the item order does matter.')

    parser.add_argument('-f', '--factorial', type=int, metavar='NUMBER', help='n! e.g., 5! = 5*4*3*2*1 = 120.')

    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    args = parser.parse_args()

    # Print usage and stop when no option was supplied.  Compare against
    # None (not truthiness) so that `-f 0` is still treated as a request.
    # SystemExit replaces quit(), which is meant for interactive sessions.
    if all(v is None for v in
           (args.combinations, args.permutations, args.factorial)):
        parser.print_help()
        raise SystemExit

    if args.factorial is not None:
        print(factorial(args.factorial))

    if args.combinations is not None:
        print(combinations(args.combinations[0], args.combinations[1]))

    # The original block printed the factorial a second time here
    # (duplicated `if args.factorial:` branch) -- removed.
    if args.permutations is not None:
        print(permutations(args.permutations[0], args.permutations[1]))
+
+
+
+
\ No newline at end of file
diff --git a/useful_scripts/conc_gzip_files.py b/useful_scripts/conc_gzip_files.py
index da849c9..b8d9b33 100644
--- a/useful_scripts/conc_gzip_files.py
+++ b/useful_scripts/conc_gzip_files.py
@@ -13,7 +13,7 @@ def conc_gzip_files(in_dir, out_file, append=False, print_progress=True):
Keyword arguments:
in_dir (str): Path of the directory with the gzip-files
out_file (str): Path to the resulting file
- append (bool): If true, it appends contents to an exisiting file,
+ append (bool): If true, it appends contents to an existing file,
else creates a new output file.
print_progress (bool): prints progress bar if true.
diff --git a/useful_scripts/find_file.py b/useful_scripts/find_file.py
new file mode 100644
index 0000000..8cbcc4d
--- /dev/null
+++ b/useful_scripts/find_file.py
@@ -0,0 +1,18 @@
+# Sebastian Raschka 2014
+#
+# A Python function to find files in a directory based on a substring search.
+
+
+import os
+
def find_files(substring, path):
    """Return the full paths of all entries in *path* whose
    filename contains *substring* (same order as os.listdir)."""
    return [os.path.join(path, fname)
            for fname in os.listdir(path)
            if substring in fname]
+
+# E.g.
+# find_files('Untitled', '/Users/sebastian/Desktop/')
+# returns
+# ['/Users/sebastian/Desktop/Untitled0.ipynb']
\ No newline at end of file
diff --git a/useful_scripts/fix_tab_csv.ipynb b/useful_scripts/fix_tab_csv.ipynb
new file mode 100644
index 0000000..496f89f
--- /dev/null
+++ b/useful_scripts/fix_tab_csv.ipynb
@@ -0,0 +1,94 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:996358a25da6fc77c66d183e79209307af06bd2f9abb0656d3bb70cfc2fe597a"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Sebastian Raschka 05/09/2014"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Fixing CSV files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We have a directory `../CSV_files_raw/` with CSV files where some of them have 'tab-separated' and some of them 'comma-separated' columns. \n",
+ "Here, we will 'fix' them, i.e., have them all comma-separated, and save them to a new directory `../CSV_fixed`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, we create a dictionary with the file basenames as keys. The values are lists of the file paths to the raw and new fixed CSV files. e.g., \n",
+ "\n",
+ " {\n",
+ " 'abc.csv': ['../CSV_files_raw/abc.csv', '../CSV_fixed/abc.csv'], \n",
+ " 'def.csv': ['../CSV_files_raw/def.csv', '../CSV_fixed/def.csv'], \n",
+ " ...\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import sys\n",
+ "import os\n",
+ "\n",
+ "raw_dir = '../CSV_files_raw/'\n",
+ "fixed_dir = '../CSV_fixed'\n",
+ "\n",
+ "if not os.path.exists(fixed_dir):\n",
+ " os.mkdir(fixed_dir)\n",
+ "\n",
+ "f_dict = {os.path.basename(f):[os.path.join(raw_dir, f),\n",
+ " os.path.join(fixed_dir, f)]\n",
+ " for f in os.listdir(raw_dir) if f.endswith('.csv')} "
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 8
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, we can replace the tabs with commas for the new files very easily:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "for f in f_dict.keys():\n",
+ " with open(f_dict[f][0], 'r') as raw, open(f_dict[f][1], 'w') as fixed:\n",
+ " for line in raw:\n",
+ " line = line.strip().split('\\t')\n",
+ " fixed.write(','.join(line) + '\\n')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 11
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/useful_scripts/large_csv_to_sqlite.py b/useful_scripts/large_csv_to_sqlite.py
new file mode 100644
index 0000000..9932f9c
--- /dev/null
+++ b/useful_scripts/large_csv_to_sqlite.py
@@ -0,0 +1,48 @@
# Workaround snippet for reading a very large CSV file that exceeds the
# machine's memory and dumping it into an SQLite database using pandas.
#
# Sebastian Raschka, 2015
#
# Tested in Python 3.4.2 and pandas 0.15.2

import sqlite3

import pandas as pd

# In- and output file paths
in_csv = '../data/my_large.csv'
out_sqlite = '../data/my.sqlite'

table_name = 'my_table'  # name for the SQLite database table
chunksize = 100000       # number of CSV lines to process per iteration

# column names assigned to the (header-less) CSV file
columns = ['molecule_id', 'charge', 'db', 'drugsnow', 'hba', 'hbd',
           'loc', 'nrb', 'smiles']

cnx = sqlite3.connect(out_sqlite)
try:
    # Stream the CSV in a single forward pass via `chunksize`.  The original
    # loop used `skiprows=i`, which re-scans the file from the top on every
    # iteration (quadratic overall runtime) and required an external
    # `wc -l` subprocess call just to know when to stop.
    # Use header=0 instead of header=None if the CSV has a header row.
    for chunk in pd.read_csv(in_csv,
                             header=None,   # no header; names supplied below
                             names=columns,
                             chunksize=chunksize):
        # DataFrame.to_sql replaces the deprecated pandas.io.sql.to_sql;
        # index_label was dropped -- it has no effect with index=False.
        chunk.to_sql(name=table_name,
                     con=cnx,
                     index=False,          # don't store the DataFrame index
                     if_exists='append')   # keep adding to the same table
finally:
    cnx.close()  # close the connection even if reading/writing fails
diff --git a/useful_scripts/prepend_python_shebang.sh b/useful_scripts/prepend_python_shebang.sh
new file mode 100644
index 0000000..686225f
--- /dev/null
+++ b/useful_scripts/prepend_python_shebang.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Sebastian Raschka 05/21/2014
+# Shell script that prepends a Python shebang
+# '#!/usr/bin/env python' to all
+# Python script files in the current directory
+# so that script files can be executed via
+# >> myscript.py
+# instead of
+# >> python myscript.py
+
+# prepends '#!/usr/bin/env python' to all .py files
+
+find ./ -maxdepth 1 -name "*.py" -exec sed -i.bak '1i\
+#!/usr/bin/env python
+' {} \;
+
+# removes only the temporary .bak backups created by sed above;
+# -maxdepth 1 matches the find call above so unrelated .bak files in
+# subdirectories are left alone, and plain `rm -f` suffices for files
+find . -maxdepth 1 -name "*.bak" -exec rm -f {} \;
+
+# makes Python scripts executable
+chmod ug+x *.py
diff --git a/useful_scripts/preprocess_first_last_names.py b/useful_scripts/preprocess_first_last_names.py
new file mode 100644
index 0000000..b0957c2
--- /dev/null
+++ b/useful_scripts/preprocess_first_last_names.py
@@ -0,0 +1,84 @@
+# Sebastian Raschka 2014
+#
+# A Python function to generalize first and last names.
+# The typical use case of such a function is to merge data that have been collected
+# from different sources (e.g., names of soccer players as shown in the doctest.)
+#
+
+import unicodedata
+import string
+import re
+
+def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
+    """
+    Function that outputs a person's name in the format
+    <last_name><separator><firstname letter(s)> (all lowercase)
+
+    >>> preprocess_names("Samuel Eto'o")
+    'etoo s'
+
+    >>> preprocess_names("Eto'o, Samuel")
+    'etoo s'
+
+    >>> preprocess_names("Eto'o,Samuel")
+    'etoo s'
+
+    >>> preprocess_names('Xavi')
+    'xavi'
+
+    >>> preprocess_names('Yaya Touré')
+    'toure y'
+
+    >>> preprocess_names('José Ángel Pozo')
+    'pozo j'
+
+    >>> preprocess_names('Pozo, José Ángel')
+    'pozo j'
+
+    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
+    'pozo jo'
+
+    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
+    'etoo sa'
+
+    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
+    'etoo'
+
+    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
+    'etoo, s'
+
+    """
+
+    # set first and last name positions
+    last, first = 'last', 'first'
+    last_pos = -1
+
+    # a comma means the name is written as "Last, First", so swap the
+    # group roles and the position of the last-name token
+    if ',' in name:
+        last, first = first, last
+        name = name.replace(',', ' ')
+        last_pos = 1
+
+    # drop middle names: keep only the first token and the last-name token
+    spl = name.split()
+    if len(spl) > 2:
+        name = '%s %s' % (spl[0], spl[last_pos])
+
+    # remove accents
+    name = ''.join(x for x in unicodedata.normalize('NFKD', name) if x in string.ascii_letters+' ')
+
+    # get first and last name if applicable
+    # (raw string with named groups restored; the angle-bracketed group
+    # names had been stripped, leaving an invalid pattern)
+    m = re.match(r'(?P<first>\w+)\W+(?P<last>\w+)', name)
+    if m:
+        output = '%s%s%s' % (m.group(last), output_sep, m.group(first)[:firstname_output_letters])
+    else:
+        output = name
+    return output.lower().strip()
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/useful_scripts/principal_eigenvector.py b/useful_scripts/principal_eigenvector.py
new file mode 100644
index 0000000..913cf62
--- /dev/null
+++ b/useful_scripts/principal_eigenvector.py
@@ -0,0 +1,20 @@
+# Select a principal eigenvector via NumPy
+# to be used as a template (copy & paste) script
+
+import numpy as np
+
+# set A to be your matrix
+A = np.array([[1, 2, 3],
+              [4, 5, 6],
+              [7, 8, 9]])
+
+
+vals, vecs = np.linalg.eig(A)
+order = np.argsort(np.absolute(vals))[::-1]  # eigenvalue magnitudes, high to low
+sorted_eig_vals = vals[order]
+sorted_eig_vecs = vecs[:, order]
+
+principal_eig_vec = sorted_eig_vecs[:, 0]  # eigvec with the largest |eigval|
+
+normalized_pr_eig_vec = np.real(principal_eig_vec / principal_eig_vec.sum())
+print(normalized_pr_eig_vec)  # eigvec rescaled so its entries sum to one
diff --git a/useful_scripts/random_string_generator.py b/useful_scripts/random_string_generator.py
new file mode 100644
index 0000000..15cfe51
--- /dev/null
+++ b/useful_scripts/random_string_generator.py
@@ -0,0 +1,22 @@
+import string
+import random
+
+def rand_string(length):
+    """ Generates a random string of numbers, lower- and uppercase chars.
+
+    NOTE: uses the `random` module, so the output is NOT suitable for
+    security purposes (passwords, tokens); use the `secrets` module there.
+    """
+    # ascii_letters == ascii_lowercase + ascii_uppercase
+    alphabet = string.ascii_letters + string.digits
+    return ''.join(random.choice(alphabet) for i in range(length))
+
+if __name__ == '__main__':
+    print("Example1:", rand_string(length=4))
+    print("Example2:", rand_string(length=8))
+    print("Example3:", rand_string(length=16))  # was mislabeled "Example2"
+
+
+    # Example1: 5bVL
+    # Example2: oIIg37xl
+    # Example3: 7IqDbrf506TatFO9
diff --git a/useful_scripts/sparsify_matrix.py b/useful_scripts/sparsify_matrix.py
new file mode 100644
index 0000000..ef5e141
--- /dev/null
+++ b/useful_scripts/sparsify_matrix.py
@@ -0,0 +1,38 @@
+# Sebastian Raschka 2014
+#
+# Sparsifying a matrix by Zeroing out all elements but the top k elements in a row.
+# The matrix could be a distance or similarity matrix (e.g., kernel matrix in kernel PCA),
+# where we are interested to keep the top k neighbors.
+
+import numpy as np
+
+print('Sparsify a matrix by zeroing all elements but the top 2 values in a row.\n')
+
+A = np.array([[1,2,3,4,5],[9,8,6,4,5],[3,1,7,8,9]])
+
+print('Before:\n%s\n' %A)
+
+
+k = 2 # keep top k neighbors
+for row in A:
+    # indices ordered high -> low; everything past the first k gets zeroed
+    drop_idx = np.argsort(row)[::-1][k:]
+    row[drop_idx] = 0
+
+print('After:\n%s\n' %A)
+
+
+"""
+Sparsify a matrix by zeroing all elements but the top 2 values in a row.
+
+Before:
+[[1 2 3 4 5]
+ [9 8 6 4 5]
+ [3 1 7 8 9]]
+
+After:
+[[0 0 0 4 5]
+ [9 8 0 0 0]
+ [0 0 0 8 9]]
+
+"""
\ No newline at end of file