Home » How can I control results returned by Python’s re.findall() on an html string?

How can I control results returned by Python’s re.findall() on an html string?

# Capture every "Catalina 320" that sits directly inside a <tag>...</tag> pair
# anywhere in `string` (no cut-off at "These boats" yet).
# NOTE: re.DOTALL has no effect here because the pattern contains no ".".
resultsArray = re.findall(r'<tag>(Catalina 320)</tag>', string, re.DOTALL)

Blah blah blah
    <tag>**Catalina 320**</tag>
  Blah
    <td>**Catalina 320**</td>
  Blah Blah
    <tag>**These boats** are fully booked for the day</tag>
  Blah blah blah
    <tag>Catalina 320</tag>
    <tag>Catalina 320</tag>

# Search only the text that precedes the first "These boats" marker;
# str.split(...)[0] is the whole string when the marker is absent.
re.findall('Catalina 320', string.split('These boats')[0])

import lxml.etree as ET
from lxml.etree import XMLParser

# Walk the parsed document in order, collecting the serialized elements whose
# text mentions "Catalina 320", and stop at the first element whose text
# mentions "These boats".
resultsArray = []
# recover=True asks lxml to keep parsing past malformed markup.
parser = XMLParser(ns_clean=True, recover=True)
tree = ET.parse('foo.html', parser)   # See End-Note 2
for elem in tree.findall("//"):
    # elem.text is None for elements without leading text; `in None` would
    # raise TypeError, so fall back to an empty string.
    elem_text = elem.text or ""
    if "These boats" in elem_text:
        break
    elif "Catalina 320" in elem_text:
        resultsArray.append(ET.tostring(elem).strip())

# Single-argument print(...) gives the same output on Python 2 and 3.
print(resultsArray)

[[email protected] ~]$ python foo.py
['<tag>**Catalina 320**</tag>', '<td>**Catalina 320**</td>']
[[email protected] ~]$

<body>
<tag>Blah blah blah</tag>
    <tag>**Catalina 320**</tag>
  <tag>Blah<tag>
    <td>**Catalina 320**</td>
  </tag>Blah Blah </tag>
    <tag>**These boats** are fully booked for the day</tag>
  <tag>Blah blah blah</tag>
    <tag>Catalina 320</tag>
    <tag>Catalina 320</tag>
    </body>

from lxml.html import soupparser
# …
# Parse with lxml's recovering XML parser first; if decoding the file fails,
# fall back to the BeautifulSoup-backed soupparser.
parser = XMLParser(ns_clean=True, recover=True)
try:
    # Only the parse itself is expected to raise UnicodeDecodeError.
    tree = ET.parse('foo.html', parser)
except UnicodeDecodeError:
    tree = soupparser.parse('foo.html')

# NOTE(review): `r` is assumed to be an open file-like object defined
# elsewhere; only the unterminated pattern literal is repaired here.
# With re.DOTALL the greedy ".*" can span newlines up to the last
# "These boats" in the input.
groups = re.findall(r'(Catalina 320)*.*These boats', r.read(), re.DOTALL)

<body>
<tag>Blah blah blah</tag>
    <tag>**Catalina 320**</tag>
  <tag>Blah<tag>
    <td>**Catalina 320**</td>
  </tag>Blah Blah </tag>
    <tag>**These boats** are fully booked for the day</tag>
  <tag>Blah blah blah</tag>
    <tag>Catalina 320</tag>
    <tag>Catalina 320</tag>
    </body>

from time import clock
n = 1000  # number of repetitions of each timed loop below

########################################################################

import lxml.etree as ET
from lxml.etree import XMLParser

# Benchmark: lxml-based solution. The document is parsed once, outside the
# timed region; only the element walk is repeated n times.
parser = XMLParser(ns_clean=True, recover=True)
etree = ET.parse('foo.html', parser)

te = clock()
for i in xrange(n):
    resultsArray = []
    for thing in etree.findall("//"):
        # thing.text is None for elements without leading text; guard so the
        # membership tests cannot raise TypeError.
        thing_text = thing.text or ""
        if "These boats" in thing_text:
            break
        elif "Catalina 320" in thing_text:
            resultsArray.append(ET.tostring(thing).strip())
tf = clock()

# Single-argument print(...) gives the same output on Python 2 and 3; the
# format string reproduces what `print a, '\n', b` emits.
print('Solution with lxml')
print('%s \n%s' % (tf - te, resultsArray))

########################################################################

# Load the whole document once so every regex variant scans the same string.
with open('foo.html') as f:
    text = f.read()

import re

print('\n\n----------------------------------')
# Solution 1: a single pattern. It matches "Catalina 320" and then,
# optionally, runs on to "These boats" (without consuming a character that is
# directly followed by another "Catalina 320") and swallows the rest of the
# string up to \Z, so occurrences after the marker are never matched.
rigx = re.compile(r'(Catalina 320)(?:(?:.(?!Catalina 320))*These boats.*\Z)?', re.DOTALL)

te = clock()
for i in xrange(n):
    yi = rigx.findall(text)
tf = clock()

# The format string reproduces what `print a, '\n', b` emits on Python 2.
print('Solution 1 with a regex')
print('%s \n%s' % (tf - te, yi))

print('\n----------------------------------')

# Solution 2: scan for either token in document order and stop at the first
# "These boats" (group 2), mirroring the break in the lxml loop.
ragx = re.compile('(Catalina 320)|(These boats)')

te = clock()
for i in xrange(n):
    li = []
    for mat in ragx.finditer(text):
        if mat.group(2):
            break
        else:
            li.append(mat.group(1))
tf = clock()

# The format string reproduces what `print a, '\n', b` emits on Python 2.
print('Solution 2 with a regex, similar to solution with lxml')
print('%s \n%s' % (tf - te, li))

print('\n----------------------------------')

# Solution 3: restrict the search window instead of complicating the pattern.
regx = re.compile('(Catalina 320)')

te = clock()
for i in xrange(n):
    # Search from 0 up to the first "These boats"; when the marker is absent
    # str.find would return -1, so fall back to the full length.
    ye = regx.findall(text, 0, text.find('These boats') if 'These boats' in text else len(text))
tf = clock()

# The format string reproduces what `print a, '\n', b` emits on Python 2.
print('Solution 3 with a regex')
print('%s \n%s' % (tf - te, ye))

Solution with lxml
0.30324105438
['<tag>**Catalina 320**</tag>', '<td>**Catalina 320**</td>']

----------------------------------
Solution 1 with regex
0.0245033935877
['Catalina 320', 'Catalina 320']

----------------------------------
Solution 2 with a regex, similar to solution with lxml
0.0233258696287
['Catalina 320', 'Catalina 320']

----------------------------------
Solution 3 with regex
0.00784708671074
['Catalina 320', 'Catalina 320']

# Tag-aware variant of the single-pattern solution.
# First alternative: match a whole "<tag>Catalina 320</tag>" and, optionally,
# run on to "These boats" (without consuming a character that is directly
# followed by another such tag) and swallow the rest of the input up to \Z.
# Second alternative: if "These boats" is reached without a preceding tag
# match, swallow the rest of the input as well; findall then yields '' for
# that match because group 1 did not participate.
rigx = re.compile(r'(<tag>Catalina 320</tag>)(?:(?:.(?!<tag>Catalina 320</tag>))*These boats.*\Z)?|These boats.*\Z', re.DOTALL)

Add Comment

Click here to post a comment

Your email address will not be published. Required fields are marked *