Subject: Re: pattern-based string expansion - DN [1]


nickm@mit.edu (Nick Mathewson) - 20 Apr 2000 - comp.lang.python

 On Thu, 20 Apr 2000 02:51:50 GMT, sue <smalleys@gte.net> wrote:
 >I'm not looking for all possible strings with a wildcard,
 >just a pseudo-RE-based expansion.  The RE would not support
 >open-ended arguments (* or +), but would have ?,[] and so
 >on.
 >The expansion needs to be ordered.
 >
 >I've thought about hacking the glob(3) code to work on an
 >input RE, but I was hoping this already existed somewhere.

 Well, the code below will probably only be useful to a hacker, but if
 you're prepared to hack glob(3), this should be a snap.

 It's a very very simplified version of (a subset of (a more general
 regex-manipulation module for (a project that I'm working on))).  I
 didn't need an expansion function for my project, but it was pretty
 easy to hack one up.

 ============================================================
 #!/usr/bin/python

 import string

 def expand_re(s):
     '''Given a regular expression, returns all a list of all possible
        expansions.  It supports (), |, [], ?, \[dwsDWS], and '.'.
        The backslash character escapes any other character.

        Example:
            expand_re('spam( spam( spam[!?]?)?)?|Bloody Vikings!')
               => [ 'spam spam spam!',
                    'spam spam spam?',
                    'spam spam spam',
                'spam spam',
            'spam',
            'Bloody Vikings!' ]

        BUG: Only mildly tested.
        BUG: Doesn't catch bad regular expressions, or give good error
             messages.
        BUG: Isn't very fast or efficient.
        BUG: Can't handle character escapes within [].
        BUG: Can't handle [^...].
        BUG: Doesn't support unicode.
        BUG: Doesn't support case-insensitive matching.
        BUG: Always assumes that . should match newline.
        BUG: Doesn't support +, *, {n,m}.  In order to do these, we'd need
             a function to only consider expansions up to a certain length,
             or to only return the first n expansions.
        '''

     re, pos = _parseRE(s,0)
     return re.get_expansions()

 def _parseRE(s, pos):
     '''Does the dirty work of parsing a regular expression from string 's',
        starting at position 'pos'.  Returns the Seq/Alt/Opt/Charset object
        corresponding to s, along with the position immediately following

        returns element, end tuple'''

     # First, we find all the elements in s.  Elements are:
     #  (groups)
     #  [character sets]     (also '.', \d, and friends.)
     #  characters
     #  optional-elements?
     #  |
     elements = []
     while pos < len(s):
     ch = s[pos]
     if ch == '(':
         group, pos = _parseRE(s, pos+1)
         pos = pos-1
         elements.append(group)
     elif ch == ')':
         pos = pos + 1
         break
     elif ch == '\\':
         next = _special.get(s[pos+1], None)
         if next:
         elements.append(next)
         else:
         elements.append(s[pos+1])
         pos = pos + 1
     elif ch == '[':
         chrs = []
         while 1:
         pos = pos + 1
         ch = s[pos]
         if ch == ']':
             break
         chrs.append(ch)
         elements.append(CharSet(string.join(chrs,'')))
     elif ch == '?':
         elements[-1] = Alt([elements[-1], ''])
     elif ch == '|':
         elements.append(None) #HACKHACK
     elif ch == '.':
         elements.append(_allChars)
     else:
         elements.append(ch)
     pos = pos + 1

     # Now we compact all adjacent characters into strings.  This will help
     # our efficiency in the end, I think.  The revised list is stored in
     # 'els'.
     els = []
     cur_str = None
     for el in elements:
     if type(el) == type(''):
         if cur_str is not None:
         cur_str = cur_str + el
         else:
         cur_str = el
     else:
         if cur_str is not None:
         els.append(cur_str)
         cur_str = None
         els.append(el)
     if cur_str:
     els.append(cur_str)

     # Now we go through the list to look for vertical bars -- encoded as
     # None.  We find all alternatives within the current group.
     groups = []
     cur_group = []
     for el in els:
     if el is None:
         groups.append(cur_group)
         cur_group = []
     else:
         cur_group.append(el)
     groups.append(cur_group)

     # If there's more than one alternative, return an Alt object.  Otherwise,
     # a Seq.
     if len(groups) == 1:
     return Seq(groups[0]), pos
     else:
     return Alt(map(Seq,groups)), pos

 class Seq:
     '''This class represents a series of regular expression elements, one
        following another'''
     def __init__(self, members):
     self.members = members

     def get_expansions(self):
     member_expansions = map(_expand, self.members)
     return _joinings(member_expansions)

 class Alt:
     '''This class represents a choice between two or more elements.'''
     def __init__(self, alternatives):
     self.alternatives = alternatives

     def get_expansions(self):
     member_expansions = map(_expand, self.alternatives)
     ex = []
     for el in member_expansions:
         # This is a hack; I'd rather say 'ex.extend(el)', but not
             # everyone is using Python >=1.5.2
         ex[len(ex):len(ex)] = el
     return ex

 class CharSet:
     '''This class represents a group of possible characters,
        such as [abc] or \d' '''
     def __init__(self, characters):
     self.characters = list(characters)

     def get_expansions(self):
     return self.characters

 def _expand(element):
     '''Returns the expansion of an element.'''
     if type(element) == type(''):
     return [ element ]
     else:
     return element.get_expansions()

 def _joinings(lst):
     ''' Takes a list of lists of strings, such as [ ['a1','a2'], ['b1, b2']].
         Returns a new list consisting of the joins of one member from list1,
         one from list2, and so forth. (Such as ['a1b1','a1b2','a2b1','a1b2'])
     '''
     prefixes = ['']
     for sublist in lst:
     next_prefixes = []
     if len(sublist) == 0:
         return []
     for prefix in prefixes:
         for el in sublist:
         next_prefixes.append(prefix + el)
     prefixes = next_prefixes
     return prefixes

 _allChars = CharSet(string.join(map(chr,range(0,255)),''))
 _special = {
              't' : '\t',
          'n' : '\n',
          'r' : '\r',
          'f' : '\f',
          'e' : '\e',
              's' : CharSet(' \t\n\f\r'),
          'd' : CharSet(string.digits),
          'w' : CharSet(string.letters + string.digits + "_"),
          'W':  CharSet(map(chr, [32,96] + range(0,48) + range(58,65) +
                    range(91,95) + range(123,256))),
          'S':  CharSet(map(chr, [65,11]+range(0,8)+range(14,32)+
                    range(33,92)+
                    range(93,256))),
          'D':  CharSet(map(chr, [32]+range(0,48)+range(58,256)))
          }

 if __name__ == '__main__':
     import sys
     expr = sys.argv[1]
     for el in expand_re(expr):
     print el
 ============================================================

 Was that about what you had in mind?

 Delurkingly-yrs
 --
 Nick Mathewson     <nickm@mit.edu>     http://www.mit.edu/~nickm/

Last modified
2000-07-20

(195.108.246.52)

Note: you are looking at
the snapshot of an old wiki
- much of this information
is likely to be very outdated