# Experiments with regular expressions

In [1]:
import re

In [2]:
# Literal search: list every non-overlapping occurrence of "is".
re.findall(
    pattern='is',
    string='This is an arbitrary sentence.',
)

Out[2]:
['is', 'is']

Special characters ^.\$+-

`.` matches any single character except a newline.

In [ ]:
# Each "." stands for exactly one arbitrary character, so this asks for
# two characters, the literal "is", and one more character.
re.findall(
    pattern='..is.',
    string='This is an arbitrary sentence.',
)

In [ ]:
# "w", then between one and three arbitrary characters, then "s".
re.findall(
    pattern='w.{1,3}s',
    string='This is an arbitrary sentence which is full of a lot more words.',
)

In [ ]:
# Widening the repetition range to {1,6} lets the match span a space,
# so "which is" is now found as well.
re.findall(
    pattern='w.{1,6}s',
    string='This is an arbitrary sentence which is full of a lot more words.',
)

In [ ]:
# + matches one or more repetitions of the preceding element.

In [ ]:
# Any character, then "e", then one or more "l".
re.findall(
    pattern='.el+',
    string='She sells sea shells by the sea shore. She likes to sell them to elfs.',
)

In [ ]:
# As before, but the run of "l" must be followed directly by "s".
re.findall(
    pattern='.el+s',
    string='She sells sea shells by the sea shore. She likes to sell them to elfs.',
)

In [ ]:
# ".+" is greedy: starting at the first "e" it runs on to the last "s"
# in the string, so the whole remainder comes back as a single match.
re.findall(
    pattern='e.+s',
    string='She sells sea shells by the sea shore. She likes to sell them to elfs.',
)

In [ ]:
# Matching is "greedy" by default: a quantifier such as + consumes as many characters as it can.

In [ ]:
# Appending ? to a quantifier (for example +?) makes it "lazy": it matches as few characters as possible.

In [ ]:
# Lazy variant: ".+?" stops at the nearest following "s", giving many
# short matches instead of one long one.
re.findall(
    pattern='e.+?s',
    string='She sells sea shells by the sea shore. She likes to sell them to elfs.',
)

In [ ]:
# [] encloses a character class: it matches any single one of the listed characters (ranges such as a-z are allowed).

In [ ]:
# Character class [ls]: the character after "e" must be "l" or "s".
re.findall(
    pattern='.e[ls].',
    string='She sells sea shells by the sea shore. She likes to sell them to elfs.',
)

In [ ]:
# Range [a-z]: the character after "e" may be any lowercase ASCII letter.
re.findall(
    pattern='.e[a-z].',
    string='She sells sea shells by the sea shore. She likes to sell them to elfs.',
)

In [ ]:
# First attempt at an e-mail pattern: lowercase letters, "@", lowercase
# letters, a literal dot, lowercase letters.
# Written as a raw string: in an ordinary literal "\." is an invalid escape
# sequence, which recent Python versions warn about. The pattern text that
# reaches the regex engine is unchanged.
emailmatcher = r'[a-z]+@[a-z]+\.[a-z]+'

In [ ]:
# NOTE(review): this cell repeats the preceding e-mail pattern verbatim;
# it is kept so the cell sequence is unchanged.
# Raw string so that "\." is not treated as an (invalid) string escape.
emailmatcher = r'[a-z]+@[a-z]+\.[a-z]+'

In [ ]:
# Allow upper-case letters and digits before the "@".
# Ranges inside [] are simply written back to back. A comma inside a
# character class is itself a literal member, so "[a-z,A-Z,0-9]" would
# also accept "," in the local part.
# Raw string so that "\." is not treated as an (invalid) string escape.
emailmatcher = r'[a-zA-Z0-9]+@[a-z]+\.[a-z]+'

In [ ]:
# Additionally allow "_" and "." before the "@".
# No separators are needed between class members (a comma would be matched
# literally), and "." needs no backslash inside [] because it is not
# special there.
# Raw string so that "\." is not treated as an (invalid) string escape.
emailmatcher = r'[a-zA-Z0-9_.]+@[a-z]+\.[a-z]+'

In [ ]:
# The asterisk * matches zero or more repetitions of the preceding element.

In [ ]:
# "*" permits zero or more vowels between "c" and "l", so both "cl"
# (no vowel) and "col" (one vowel) are found.
s, myre = 'clear color', 'c[aeiou]*l'
re.findall(pattern=myre, string=s)

In [ ]:
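# \b matches a word boundary: the empty position between a word character and a non-word character (or the start/end of the string). It does not match whitespace itself; use \s for that.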

In [ ]:
# \b on both sides restricts the match to the standalone word "teach";
# the "teach" inside "teacher" is followed by a word character and is
# therefore rejected.
s = 'teach the teacher'
# Raw string: same pattern text as '\\bteach\\b' without doubled backslashes.
# (In an ordinary literal a single "\b" would be a backspace character.)
myre = r'\bteach\b'
re.findall(myre, s)

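Marked (capturing) groups: when the pattern contains a group in parentheses, `re.findall` returns only the text captured by that group.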
In [ ]:
# Match a complete URL ending in ".html". With no group in the pattern,
# findall returns the whole match.
s = 'blah <a href=http://foo.com/blah.html'
# Raw string so that "\." reaches the regex engine as an escaped dot
# instead of being an invalid string escape.
myre = r'http://.+\.html'
re.findall(myre, s)

In [ ]:
# Same URL pattern, but with the part after "http://" in parentheses:
# findall now returns only the captured group, not the whole match.
s = 'blah <a href=http://foo.com/blah.html'
# Raw string so that "\." reaches the regex engine as an escaped dot
# instead of being an invalid string escape.
myre = r'http://(.+\.html)'
re.findall(myre, s)


For heavy-duty applications, it is faster to pre-compile the regex:

In [ ]:
# Compile the pattern once; the resulting pattern object can then be
# reused for any number of searches.
# Raw string so that "\." is not treated as an (invalid) string escape.
myre = re.compile(r'http://(.+\.html)')
s = 'blah <a href=http://foo.com/blah.html'
# Call findall on the compiled object itself rather than passing it back
# through the module-level re.findall.
myre.findall(s)

In [ ]:
# Loose date pattern: capital letter + two characters, ", ", capital letter
# + two characters, a one- or two-digit day, ", ", and a four-digit year.
s = 'Today is Tue, Feb 14, 2017, I think.'
myre = '[A-Z].., [A-Z].. [0-3]?[0-9], [0-9]{4}'
re.findall(pattern=myre, string=s)

In [ ]:
# Search a sentence whose "day" and "month" are nonsense words.
# NOTE(review): relies on `myre` already being bound in the kernel,
# presumably the loose date pattern from the preceding cell.
s = 'Today is Nod, Boo 14, 2017, I think.'
re.findall(pattern=myre, string=s)

In [ ]:
# "|" is alternation: match either "Mon" or "Tue". Without boundaries
# the "Mon" inside "Monday" is matched too.
s = 'Today is Tue, Feb 14, 2017, I think. Or is it Monday?'
myre = 'Mon|Tue'
re.findall(pattern=myre, string=s)

In [ ]:
# Word boundaries around each alternative: "Tue" still matches, while
# the "Mon" inside "Monday" no longer does.
s = 'Today is Tue, Feb 14, 2017, I think. Or is it Monday?'
myre = r'\bMon\b|\bTue\b'
re.findall(pattern=myre, string=s)

In [ ]:
# (?:...) is a non-capturing group: it scopes the alternation so the
# trailing comma applies to both names, yet findall still returns the
# whole match because nothing is captured.
s = 'Today is Tue, Feb 14, 2017, I think. Or is it Monday?'
myre = '(?:Mon|Tue),'
re.findall(pattern=myre, string=s)

In [ ]: