I was thinking about different ways to implement a function like sanitize(unsanitized_input, ignored_words) in Python.

I came up with a few implementations and noted the downsides of each, etc. Just a little snippet that others might benefit from.

Go comment on it with your own implementation if I missed some good ideas.

"""
Demonstration of ways to implement this API:
    sanitize(unsanitized_input, ignored_words)

Related discussions:
    - Modifying a list while looping over it:
        - http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python

    - Remove all occurences of a value in a list:
        - http://stackoverflow.com/questions/1157106/remove-all-occurences-of-a-value-from-a-python-list
"""


# 1. Using set difference
def sanitize_1(unsanitized_input, ignored_words):
    """
    Take in two lists and santize first list to ensure it doesn't contain any
    items from the second list, return sanitized list
    """

    # Downsides:
    #   - Sets are unordered so if user_input was a sentence we lose the
    #     ordering because set difference will create a new set and not
    #     maintain ordering.

    return list(set(unsanitized_input) - set(ignored_words))


# 2. Using sets and instersection()
def sanitize_2(unsanitized_input, ignored_words):
    """
    Take in two lists and santize first list to ensure it doesn't contain any
    items from the second list, return sanitized list
    """

    # Downsides:
    #   - We must loop over words individually, which could be slower.
    #   - Doesn't handle if an ignore word is present more than once.

    ignored_words = set(ignored_words)
    for ignore in ignored_words.intersection(unsanitized_input):
        unsanitized_input.remove(ignore)

    return unsanitized_input


# 3. Using sets and instersection() with ability to remove ALL ignored words.
def sanitize_3(unsanitized_input, ignored_words):
    """
    Take in two lists and santize first list to ensure it doesn't contain any
    items from the second list, return sanitized list
    """
    # Downsides:
    #   - Looping over list while removing from it?
    #     http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python

    ignored_words = set(ignored_words)
    for ignore in ignored_words.intersection(unsanitized_input):
        while ignore in unsanitized_input:
            unsanitized_input.remove(ignore)

    return unsanitized_input


# 4. Using #3 with knowledge of list remove/mutable semantics considered
def sanitize_4(unsanitized_input, ignored_words):
    """
    Take in two lists and santize first list to ensure it doesn't contain any
    items from the second list, return sanitized list
    """
    # Downsides:
    #   - Looping over list while removing from it?
    #     http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python

    ignored_words = set(ignored_words)
    for ignore in ignored_words.intersection(unsanitized_input):
        while ignore in unsanitized_input:
            unsanitized_input.remove(ignore)

    # Note: The return is not technically necessary because remove() does an
    # in-place removal so the original list passed in is modified.


# 5. Using #4 without implicit side-effect of modifying list user passed in.
#    This demonstrates the functional paradigm of functions never modifiying
#    input parameters and always having the same output for the same input.
def sanitize_5(unsanitized_input, ignored_words):
    """
    Take in two lists and santize first list to ensure it doesn't contain any
    items from the second list, return sanitized list
    """

    # Downsides:
    #   - Requires extra memory usage because original unsanitized_input is
    #     copied.  Also, the final memory usage is increased in this version by
    #     the number of words/characters that are NOT in the ignored_words
    #     because those are removed.

    # Copy input list (various ways to accomplish, but list() syntax is more
    # clear IMO)
    unsanitized_input = list(unsanitized_input)
    # unsanitized_input[:] = unsanitized_input
    # import copy
    # unsanitized_input = copy(unsanitized_input)

    ignored_words = set(ignored_words)
    for ignore in ignored_words.intersection(unsanitized_input):
        while ignore in unsanitized_input:
            unsanitized_input.remove(ignore)

    # Return is required now b/c we didn't touch the original list, we have our
    # own local copy that is unsanitized
    return unsanitized_input


def check_results():
    user_input = 'the cat walked down the road.'.split()
    ignore_words = ['the', 'um', 'eh']

    print 'sanitize_1', sanitize_1(user_input, ignore_words)
    print 'sanitize_2', sanitize_2(user_input, ignore_words)
    print 'sanitize_3', sanitize_3(user_input, ignore_words)

    # Create new list so sanitize_4 gets fresh copy of words to ignore instead
    # of an already sanitized list (previous calls modified original
    # user_input)
    user_input = 'the cat walked down the road.'.split()
    sanitize_4(user_input, ignore_words)
    print 'sanitize_4', user_input

    # Start fresh again, previous call modified in-place again.
    user_input = 'the cat walked down the road.'.split()
    user_input_returned = sanitize_5(user_input, ignore_words)
    print 'sanitize_5', user_input_returned
    print '  original list', user_input, id(user_input)
    print '  sanitized list', user_input_returned, id(user_input_returned)


def check_performance():
    from timeit import timeit
    from functools import partial
    import sys

    def _sanitize_funcs():
        module = sys.modules[__name__]

        # Loop over them in order so we can print them as we 'improved' the
        # algorithm
        for obj in sorted(vars(module).values()):
            if callable(obj) and obj.__name__.startswith('sanitize_'):
                yield obj

    # This is needed to setup the local variables and have a context for
    # the function.
    ignore_words = ['the', 'eh' 'um', 'ugh']
    lots_of_input = 'the cat walked down the road.'
    user_input = lots_of_input.split()

    for func in _sanitize_funcs():
        print '%s %f secs' % (func.__name__,
                              timeit(partial(func, user_input, ignore_words)))


def main():
    print '----- Function Results -----'
    check_results()

    print '----- Function Performance -----'
    check_performance()


if __name__ == "__main__":
    main()

Join the conversation on github (3 comments)

I also included some code to check the performance of the functions. Obviously, they all perform fairly close to each other, but just something interesting to think about in your free time:

----- Function Performance -----
sanitize_1 1.100186 secs
sanitize_2 0.661218 secs
sanitize_3 0.664413 secs
sanitize_4 0.682067 secs
sanitize_5 0.983952 secs