In [1]:
# Load the shared notebook styling. The helper is looked up from
# ../../notebook_format, so the working directory is switched there first.
import os

# path : remember the starting directory so a later cell can chdir back to it
path = os.getcwd()
os.chdir(os.path.join('..', '..', 'notebook_format'))

# presumably `formats` is importable only from the directory entered above
from formats import load_style
load_style(plot_style=False)
Out[1]:
In [2]:
# go back to the directory that was saved in `path` before the style was loaded
os.chdir(path)

# magic to print version: author name (-a), date (-d), time (-t)
# and the Python / IPython versions (-v)
%load_ext watermark
%watermark -a 'Ethen' -d -t -v
Ethen 2017-10-31 17:04:05 

CPython 3.5.2
IPython 6.2.1

Strings and Text

Some of the material is a condensed reimplementation of Python 3 Cookbook, Chapter 2: Strings and Text, which was originally freely available online.

Splitting Strings on Any of Multiple Delimiters Using re.split

The separator is a semicolon (;), a comma (,), or a whitespace character, optionally followed by any amount of additional whitespace.

In [3]:
import re

line = 'asdf fjdk; afed, fjek,asdf,      foo'

# one delimiter character (semicolon, comma or whitespace);
# any run of whitespace right after it is swallowed with it
delimiter = re.compile(r'[;,\s]\s*')
delimiter.split(line)
Out[3]:
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

Matching Text at the Start or End of a String

Use str.startswith() or str.endswith().

In [4]:
filenames = ['Makefile', 'foo.c', 'bar.py', 'spam.c', 'spam.h']

# several suffixes at once have to be given as a tuple (a list raises TypeError)
c_family = ('.c', '.h')
print(list(filter(lambda name: name.endswith(c_family), filenames)))

# a single suffix: is there at least one Python file?
print(any(map(lambda name: name.endswith('.py'), filenames)))
['foo.c', 'spam.c', 'spam.h']
True

Matching Strings with Wildcard Patterns Using fnmatchcase

In [5]:
from fnmatch import fnmatchcase

addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK st',
    '4802 N BROADWAY',
]

# fnmatchcase compares case-sensitively, so the lowercase 'st' entry is skipped
list(filter(lambda addr: fnmatchcase(addr, '* ST'), addresses))
Out[5]:
['5412 N CLARK ST', '1060 W ADDISON ST']

Matching and Searching for Text Patterns

Example1: Finding the position of a simple first match using str.find().

In [6]:
text = 'yeah, but no, but yeah, but no, but yeah'

# find gives the index of the first occurrence only
first_no = text.find('no')
first_no
Out[6]:
10

Example2: When matching the same complex pattern many times, it's better to precompile the regular expression first using re.compile().

In [7]:
import re

text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

# Simple matching: \d+ means match one or more digits.
# The 'r' prefix makes a raw string, which leaves the backslash (\)
# uninterpreted; without it every backslash would need doubling (\\)
for candidate in (text1, text2):
    print('yes' if re.match(r'\d+/\d+/\d+', candidate) else 'no')

# the re.compile version: build the pattern object once, then reuse it
datepat = re.compile(r'\d+/\d+/\d+')
for candidate in (text1, text2):
    print('yes' if datepat.match(candidate) else 'no')
yes
no
yes
no

Example3: Find all occurrences in the text instead of just the first one with findall().

In [8]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

# findall gathers every non-overlapping match, not just the first one
all_dates = datepat.findall(text)
all_dates
Out[8]:
['11/27/2012', '3/13/2013']

Example4: Capture groups by enclosing parts of the pattern in parentheses.

In [9]:
# single match: each parenthesised part of the pattern becomes its own group
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')

all_groups = m.groups()
first_group = m.group(1)
print(all_groups, first_group, sep='\n')
('11', '27', '2012')
11
In [10]:
# multiple matches: with groups in the pattern, findall returns one tuple per match
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

found = datepat.findall(text)
print(found)

# module-level shortcut, handy when the pattern is only needed once
print(re.findall(r'(\d+)/(\d+)/(\d+)', text))

for month, day, year in found:
    print('{}-{}-{}'.format(year, month, day))
[('11', '27', '2012'), ('3', '13', '2013')]
[('11', '27', '2012'), ('3', '13', '2013')]
2012-11-27
2013-3-13
In [11]:
# finditer hands back match objects one at a time rather than a full list
date_matches = datepat.finditer(text)
for m in date_matches:
    print(m.groups())
('11', '27', '2012')
('3', '13', '2013')

Searching and Replacing Text

Example1: Replacing a simple literal string using str.replace().

In [12]:
text = 'yeah, but no, but yeah, but no, but yeah'

# every occurrence of the literal is swapped, not only the first one
replaced = text.replace('yeah', 'yep')
replaced
Out[12]:
'yep, but no, but yep, but no, but yep'

Example2: More complex replacement using re.sub(). The backslashed digits (\1, \2, ...) refer to the matched groups.

In [13]:
import re

# rewrite dates from month/day/year to year-month-day
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
mdy_pattern = r'(\d+)/(\d+)/(\d+)'
ymd_template = r'\3-\1-\2'
re.sub(mdy_pattern, ymd_template, text)
Out[13]:
'Today is 2012-11-27. PyCon starts 2013-3-13.'

Example3: Define a function for the substitution.

In [14]:
import re
from calendar import month_abbr


def change_date(m):
    """
    Build the replacement text for one matched month/day/year date.

    Parameters
    ----------
    m : match object
        Match whose three groups are the month, day and year digits.

    Returns
    -------
    str
        'day Mon year', e.g. '27 Nov 2012'. When the first group is not
        a month number in 1-12 the matched text is returned unchanged.
    """
    month = int(m.group(1))

    # month_abbr[0] is an empty placeholder and an index above 12 raises
    # IndexError, so text such as '13/27/2012' is left untouched
    if not 1 <= month <= 12:
        return m.group(0)

    mon_name = month_abbr[month]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))


# sub accepts a callable; it is called with each match to produce the replacement
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
with_month_names = datepat.sub(change_date, text)
with_month_names
Out[14]:
'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

Example4: Use .subn() to replace and also return the number of substitutions made.

In [15]:
# subn returns a (new_string, number_of_substitutions) pair
substitution = datepat.subn(r'\3-\1-\2', text)
newtext, n = substitution
print(newtext, n, sep='\n')
Today is 2012-11-27. PyCon starts 2013-3-13.
2

Example5: Supply the re.IGNORECASE flag if you want to ignore case.

In [16]:
text = 'UPPER PYTHON, lower python, Mixed Python'

# compile with the flag so every casing of the word is matched
any_case_python = re.compile('python', flags=re.IGNORECASE)
any_case_python.findall(text)
Out[16]:
['PYTHON', 'python', 'Python']

Stripping Unwanted Characters from Strings Using strip

For unwanted characters at the beginning and end of the string, use str.strip(). There are also str.lstrip() and str.rstrip() for stripping only the left or the right side.

In [17]:
# with no argument, strip removes leading and trailing whitespace
s = '   hello world  \n'

# with an argument, strip removes any of the listed characters from both ends
t = '-----hello world====='

for stripped in (s.strip(), t.strip('-=')):
    print(stripped)
hello world
hello world
In [18]:
"""
with open(filename) as f:
    lines = (line.strip() for line in f)
    for line in lines:
"""
print('Generator Expression can be useful when you want to perform other operations after stripping')
Generator Expression can be useful when you want to perform other operations after stripping

Character to Character Mapping Using translate.

Boilerplate: str.translate() returns a copy of the string in which each character has been mapped through a translation table built beforehand with str.maketrans().

In [19]:
intab = 'aeiou'
outtab = '12345'

# maps the character a > 1, e > 2, i > 3, o > 4, u > 5
trantab = str.maketrans(intab, outtab)

# keep the sample under its own name: binding it to `str` would shadow the
# builtin, so re-running this cell (or calling str(...) later) would fail
sample = 'this is string example....wow!!!'
print(sample.translate(trantab))
th3s 3s str3ng 2x1mpl2....w4w!!!

Combining and Concatenating Strings

Example1: Use .join() when the strings you wish to combine are in a sequence.

In [20]:
parts = ['Is', 'Chicago', 'Not', 'Chicago?']

# the string that join is called on is placed between the items of the sequence
joined = ' '.join(parts)
print(joined)
Is Chicago Not Chicago?

Example2: Don't use the + operator when unnecessary.

In [21]:
a = 'Is Chicago'
b = 'Not Chicago?'

# explicit concatenation with the + operator
concatenated = a + ' ' + b
print(concatenated)

# print can place the separator between its arguments itself
print(a, b, sep=' ')
Is Chicago Not Chicago?
Is Chicago Not Chicago?

String Formatting

In [22]:
s = '{name} has {n} messages.'

# named placeholders are filled from the keyword arguments of the same name
message = s.format(name='Guido', n=37)
message
Out[22]:
'Guido has 37 messages.'

Reformatting Text to a Fixed Number of Columns Using textwrap

In [23]:
import os
import shutil
import textwrap

s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

# wrap the paragraph so that no line is longer than 40 characters
print(textwrap.fill(s, 40))

# if you want to get the text to match the terminal size:
# shutil.get_terminal_size falls back to a default size when no terminal
# is attached, a case where os.get_terminal_size would raise OSError
print(shutil.get_terminal_size().columns)
Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
90