Software Carpentry logo

Regular Expressions

April 24, 2010: We are pleased to announce that Version 4 of this course is now under development. For updates and an early peek at the content, please check out the Software Carpentry blog at http://www.software-carpentry.org/blog/.

1) Introduction

2) You Can Skip This Lecture If...

3) A Simple Example

import re

# Each entry pairs a DNA string with a dragon name.
dragons = [
    ['CTAGGTGTACTGATG',    'Antipodean Opaleye'],
    ['AAGATGCGTCCGTAT',    'Common Welsh Green'],
    ['AGTCGTGCTCGTTATATC', 'Hebridean Black'],
    ['ATGCGTCGTCGATTATCT', 'Hungarian Horntail'],
    ['CCGTTAGGGCTAAATGCT', 'Norwegian Ridgeback']
]

# Report every dragon whose DNA contains the literal sequence ATGCGT.
for (dna, name) in dragons:
    if re.search('ATGCGT', dna):
        # A parenthesised single-argument print produces the same output on
        # Python 2 and Python 3; the bare statement form is Python 2 only.
        print(name)
Common Welsh Green
Hungarian Horntail

4) This or That

import re

# Each entry pairs a DNA string with a dragon name.
dragons = [
    ['CTAGGTGTACTGATG',    'Antipodean Opaleye'],
    ['AAGATGCGTCCGTAT',    'Common Welsh Green'],
    ['AGTCGTGCTCGTTATATC', 'Hebridean Black'],
    ['ATGCGTCGTCGATTATCT', 'Hungarian Horntail'],
    ['CCGTTAGGGCTAAATGCT', 'Norwegian Ridgeback']
]

# '|' means "either side": report a dragon if its DNA contains ATGCGT or GCT.
for (dna, name) in dragons:
    if re.search('ATGCGT|GCT', dna):
        # A parenthesised single-argument print produces the same output on
        # Python 2 and Python 3; the bare statement form is Python 2 only.
        print(name)
Common Welsh Green
Hebridean Black
Hungarian Horntail
Norwegian Ridgeback

5) Precedence

import re

# Each case is [input string, whether 'AT(A|C)' should be found in it].
tests = [
    ['ATA',   True],
    ['xATCx', True],
    ['ATG',   False],
    ['AT',    False],
    ['ATAC',  True]
]

# The parentheses limit the alternation to the final letter, so the pattern
# reads "AT followed by either A or C".
for case in tests:
    sequence, should_match = case
    found = re.search('AT(A|C)', sequence)
    assert (found is not None) == should_match

6) Escaping Special Characters

7) Raw Strings

8) Sequences

# Each case is [input string, whether 'TTAG*CTA' should be found in it].
tests = [
    ['TTACTA',    True],  # separated by zero G's
    ['TTAGCTA',   True],  # separated by one G
    ['TTAGGGCTA', True],  # separated by three G's
    ['TTAXCTA',   False], # an X in the way
    ['TTAGCGCTA', False], # an embedded C in the way
]

# 'G*' allows any number of G's (including none) between TTA and CTA, but
# nothing other than G.
for (dna, expected) in tests:
    actual = re.search('TTAG*CTA', dna) is not None
    assert actual == expected
Zero or More

Figure 18.2: Zero or More

# '*' accepts zero repetitions of G, so the G-less sequence still matches ...
assert re.search('TTAG*CTA', 'TTACTA') is not None
# ... whereas '+' requires at least one G, so the same sequence is rejected.
assert re.search('TTAG+CTA', 'TTACTA') is None
One or More

Figure 18.3: One or More

9) Making Something Optional

# 'C?' makes the C optional: it may be absent ...
assert re.search('AC?T', 'AT') is not None
# ... or appear exactly once ...
assert re.search('AC?T', 'ACT') is not None
# ... but it may not appear twice.
assert re.search('AC?T', 'ACCT') is None
Zero or One

Figure 18.4: Zero or One

10) Character Sets

import re

# Sample text: some lines contain digits, some do not.
lines = [
    "Charles Darwin (1809-82)",
    "Darwin's principal works, The Origin of Species (1859)",
    "and The Descent of Man (1871) marked a new epoch in our",
    "understanding of our world and ourselves.  His ideas",
    "were shaped by the Beagle's voyage around the world in",
    "1831-36."
]

# '[0-9]' matches any single digit; '+' asks for one or more of them, so a
# line is printed if it contains at least one digit anywhere.
for line in lines:
    if re.search('[0-9]+', line):
        # A parenthesised single-argument print produces the same output on
        # Python 2 and Python 3; the bare statement form is Python 2 only.
        print(line)
Charles Darwin (1809-82)
Darwin's principal works, The Origin of Species (1859)
and The Descent of Man (1871) marked a new epoch in our
1831-36.

11) Abbreviations

12) Special Cases

import re

# Split the passage on whitespace; punctuation stays attached to its word.
words = '''Born in New York City in 1918, Richard Feynman earned a
bachelor's degree at MIT in 1939, and a doctorate from Princeton in
1942. After working on the Manhattan Project in Los Alamos during
World War II, he became a professor at CalTech in 1951.  Feynman won
the 1965 Nobel Prize in Physics for his work on quantum
electrodynamics, and served on the commission investigating the
Challenger disaster in 1986.'''.split()

# Collect the distinct words containing a lower-case vowel that sits right
# before a word boundary ('\b'), i.e. a vowel that ends a run of word
# characters.
end_in_vowel = set()
for w in words:
    if re.search(r'[aeiou]\b', w):
        end_in_vowel.add(w)

# A set has no defined iteration order, so the words may be printed in a
# different order from one interpreter (or run) to the next.
for w in end_in_vowel:
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3; the bare statement form is Python 2 only.
    print(w)
a
Prize
degree
became
doctorate
the
he

13) Anchoring

14) Extracting Matches

import sys, re

# Sample log, split into individual lines (the trailing newline leaves an
# empty final element).
lines = '''Date: 2006-03-07
On duty: HP # 01:30 - 03:00
Observed: Common Welsh Green
On duty: RW #03:00-04:30
Observed: none
On duty: HG # 04:30-06:00
Observed: Hebridean Black
'''.split('\n')

# For every line containing '#', print whatever follows the first '#'.
# Any whitespace after the '#' is kept, so the results are not aligned.
for line in lines:
    if re.search('#', line):
        comment = line.split('#')[1]
        # A parenthesised single-argument print produces the same output on
        # Python 2 and Python 3; the bare statement form is Python 2 only.
        print(comment)
 01:30 - 03:00
03:00-04:30
 04:30-06:00

15) Match Objects

import re

# Show what each pattern matches in the same text: the matched substring
# (group()) and the half-open index range it occupies (start(), end()).
text = 'abbcb'
for pattern in ['b+', 'bc*', 'b+c+']:
    match = re.search(pattern, text)
    # Calling print with one parenthesised, pre-formatted string produces the
    # same output on Python 2 and Python 3; the bare statement form is
    # Python 2 only.
    print('%s / %s => "%s" (%d, %d)' %
          (pattern, text, match.group(), match.start(), match.end()))
b+ / abbcb => "bb" (1, 3)
bc* / abbcb => "b" (1, 2)
b+c+ / abbcb => "bbc" (1, 4)

16) Match Groups

import sys, re

# Sample log, split into individual lines (the trailing newline leaves an
# empty final element).
lines = '''Date: 2006-03-07
On duty: HP # 01:30 - 03:00
Observed: Common Welsh Green
On duty: RW #03:00-04:30
Observed: none
On duty: HG # 04:30-06:00
Observed: Hebridean Black
'''.split('\n')

# The pattern skips the '#' and any whitespace after it, then captures the
# rest of the line as group 1, so the printed comments carry no leading
# spaces.
for line in lines:
    match = re.search(r'#\s*(.+)', line)
    if match:
        comment = match.group(1)
        # A parenthesised single-argument print produces the same output on
        # Python 2 and Python 3; the bare statement form is Python 2 only.
        print(comment)
01:30 - 03:00
03:00-04:30
04:30-06:00

17) Reversing Columns

import re

def reverse_columns(line):
    """Swap the two numeric columns of a line.

    If `line` consists of exactly two runs of digits separated by
    whitespace (leading and trailing whitespace allowed), return the two
    runs in reverse order separated by a single space.  Any other line is
    returned unchanged.
    """
    found = re.search(r'^\s*(\d+)\s+(\d+)\s*$', line)
    if found is None:
        # Not a two-column numeric line: hand it back untouched.
        return line
    first, second = found.groups()
    return ' '.join((second, first))

# Each case is [input line, short description of what the case exercises].
tests = [
    ['10 20',    'easy case'],
    [' 30  40 ', 'padding'],
    ['60 70 80', 'too many columns'],
    ['90 end',   'non-numeric']
]

# Show the input and the result side by side for every case.
for (fixture, title) in tests:
    actual = reverse_columns(fixture)
    # Calling print with one parenthesised, pre-formatted string produces the
    # same output on Python 2 and Python 3; the bare statement form is
    # Python 2 only.
    print('%s: "%s" => "%s"' % (title, fixture, actual))
easy case: "10 20" => "20 10"
padding: " 30  40 " => "40 30"
too many columns: "60 70 80" => "60 70 80"
non-numeric: "90 end" => "90 end"

18) Compiling

19) Finding Title Case Words

import re

# Compiled at module level so the work is done once, not on every call to
# 'find_all'.  Group 1 is a capitalised word (one upper-case letter followed
# by zero or more lower-case letters, bounded on both sides); group 2 is the
# text that follows it, up to the next newline.
pattern = re.compile(r'\b([A-Z][a-z]*)\b(.*)')

def find_all(line):
    """Return the title-case words in `line`, in order of appearance.

    Repeatedly searches for the next capitalised word, records it, and then
    continues the search in the text captured after that word until no
    further match is found.
    """
    words = []
    remainder = line
    while True:
        found = pattern.search(remainder)
        if found is None:
            return words
        word, remainder = found.groups()
        words.append(word)

# Sample input for 'find_all'.
lines = [
    'This has several Title Case words',
    'on Each Line (Some in parentheses).'
]

# Echo each line, then list the title-case words found in it, one per line
# and indented by a tab.
for line in lines:
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3; the bare statement form is Python 2 only.
    print(line)
    for word in find_all(line):
        # Concatenate rather than passing two arguments: the output is then a
        # tab immediately followed by the word on both Python 2 and Python 3
        # (Python 3's print would otherwise insert a space between them).
        print('\t' + word)
This has several Title Case words
	This
	Title
	Case
on Each Line (Some in parentheses).
	Each
	Line
	Some

20) Finding All Matches

import re

# Sample input.
lines = [
    'This has several Title Case words',
    'on Each Line (Some in parentheses).'
]

# With a single group in the pattern, 'findall' returns the list of group-1
# strings for every non-overlapping match, so no manual search loop is needed.
pattern = re.compile(r'\b([A-Z][a-z]*)\b')
for line in lines:
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3; the bare statement form is Python 2 only.
    print(line)
    for word in pattern.findall(line):
        # Concatenate rather than passing two arguments: the output is then a
        # tab immediately followed by the word on both Python 2 and Python 3
        # (Python 3's print would otherwise insert a space between them).
        print('\t' + word)
This has several Title Case words
	This
	Title
	Case
on Each Line (Some in parentheses).
	Each
	Line
	Some

21) Reference Material

Pattern Matches Doesn't Match Explanation
a* "", "a", "aa", ... "A", "b" * means "zero or more" matching is case sensitive
b+ "b", "bb", ... "" + means "one or more"
ab?c "ac", "abc" "a", "abbc" ? means "optional" (zero or one)
[abc] "a", "b", or "c" "ab", "d" [...] means "one character from a set"
[a-c] "a", "b", or "c" Character ranges can be abbreviated
[abc]* "", "ac", "baabcab", ... Operators can be combined: zero or more choices from "a", "b", or "c"

Table 18.3: Regular Expression Operators

Method Purpose Example Result
split Split a string on a pattern. re.split('\\s*,\\s*', 'a, b ,c , d') ['a', 'b', 'c', 'd']
findall Find all matches for a pattern. re.findall('\\b[A-Z][a-z]*', 'Some words in Title Case.') ['Some', 'Title', 'Case']
sub Replace matches with new text. re.sub('\\d+', 'NUM', 'If 123 is 456') "If NUM is NUM"

Table 18.4: Regular Expression Object Methods

22) But Wait, There's More

23) Summary