# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
|
|
|
|
enchant.tokenize: String tokenization functions for PyEnchant
|
|
================================================================
|
|
|
|
An important task in spellchecking is breaking up large bodies of
|
|
text into their constituent words, each of which is then checked
|
|
for correctness. This package provides Python functions to split
|
|
strings into words according to the rules of a particular language.
|
|
|
|
Each tokenization function accepts a string as its only positional
|
|
argument, and returns an iterator that yields tuples of the following
|
|
form, one for each word found::
|
|
|
|
(<word>,<pos>)
|
|
|
|
The meanings of these fields should be clear: <word> is the word
|
|
that was found and <pos> is the position within the text at which
|
|
the word began (zero indexed, of course). The function will work
|
|
on any string-like object that supports array-slicing; in particular
|
|
character-array objects from the 'array' module may be used.
|
|
|
|
The iterator also provides the attribute 'offset' which gives the current
|
|
position of the tokenizer inside the string being split, and the method
|
|
'set_offset' for manually adjusting this position. This can be used for
|
|
example if the string's contents have changed during the tokenization
|
|
process.
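
For example, using the 'basic_tokenize' class defined below (the values
shown follow the (<word>,<pos>) format described above)::

    tknzr = basic_tokenize("pay the bill")
    next(tknzr)          # yields ("pay",0)
    tknzr.set_offset(4)  # resume scanning from position 4
    next(tknzr)          # yields ("the",4)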

To obtain an appropriate tokenization function for the language
identified by <tag>, use the function 'get_tokenizer(tag)'::

    tknzr = get_tokenizer("en_US")
    for (word,pos) in tknzr("text to be tokenized goes here"):
        do_something(word)

This library is designed to be easily extendible by third-party
authors. To register a tokenization function for the language
<tag>, implement it as the function 'tokenize' within the
module enchant.tokenize.<tag>. The 'get_tokenizer' function
will automatically detect it. Note that the underscore must be
used as the tag component separator in this case, in order to
form a valid python module name. (e.g. "en_US" rather than "en-US")
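
For example, support for a hypothetical tag "xx_YY" could be added by
creating the module enchant.tokenize.xx_YY with contents along these
lines (a minimal sketch that simply reuses the whitespace-based
tokenizer defined below)::

    # enchant/tokenize/xx_YY.py
    from enchant.tokenize import basic_tokenize

    # The name 'tokenize' is what get_tokenizer() looks for.
    class tokenize(basic_tokenize):
        pass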

Currently, a tokenizer has only been implemented for the English
language. Based on the author's limited experience, this should
be at least partially suitable for other languages.

This module also provides various implementations of "Chunkers" and
"Filters". These classes are designed to make it easy to work with
text in a variety of common formats, by detecting and excluding parts
of the text that don't need to be checked.

A Chunker is a class designed to break a body of text into large chunks
of checkable content; for example the HTMLChunker class extracts the
text content from all HTML tags but excludes the tags themselves.
A Filter is a class designed to skip individual words during the checking
process; for example the URLFilter class skips over any words that
have the format of a URL.

For example, to spellcheck an HTML document it is necessary to split the
text into chunks based on HTML tags, and to filter out common word forms
such as URLs and WikiWords. This would look something like the following::

    tknzr = get_tokenizer("en_US",(HTMLChunker,),(URLFilter,WikiWordFilter))

    text = "<html><body>the url is http://example.com</body></html>"
    for (word,pos) in tknzr(text):
        ...check each word and react accordingly...

"""
_DOC_ERRORS = ["pos", "pos", "tknzr", "URLFilter", "WikiWordFilter",
               "tkns", "tknzr", "pos", "tkns"]

import re
import warnings

import enchant
from enchant.utils import next, xrange
from enchant.errors import *

# For backwards-compatibility. This will eventually be removed, but how
# does one mark a module-level constant as deprecated?
Error = TokenizerNotFoundError


class tokenize:
    """Base class for all tokenizer objects.

    Each tokenizer must be an iterator and provide the 'offset'
    attribute as described in the documentation for this module.

    While tokenizers are in fact classes, they should be treated
    like functions, and so are named using lower_case rather than
    the CamelCase that is more traditional for class names.
    """
    _DOC_ERRORS = ["CamelCase"]

    def __init__(self, text):
        self._text = text
        self._offset = 0

    def __next__(self):
        return self.next()

    def next(self):
        raise NotImplementedError()

    def __iter__(self):
        return self

    def set_offset(self, offset, replaced=False):
        self._offset = offset

    def _get_offset(self):
        return self._offset

    def _set_offset(self, offset):
        msg = "changing a tokenizer's 'offset' attribute is deprecated;" \
              " use the 'set_offset' method"
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
        self.set_offset(offset)

    offset = property(_get_offset, _set_offset)


def get_tokenizer(tag=None, chunkers=None, filters=None):
    """Locate an appropriate tokenizer by language tag.

    This requires importing the function 'tokenize' from an appropriate
    module. Modules tried are named after the language tag, tried in the
    following order:

        * the entire tag (e.g. "en_AU.py")
        * the base language code of the tag (e.g. "en.py")

    If the language tag is None, a default tokenizer (actually the English
    one) is returned. It's unicode aware and should work OK for most
    latin-derived languages.

    If a suitable function cannot be found, raises TokenizerNotFoundError.

    If given and not None, 'chunkers' and 'filters' must be lists of chunker
    classes and filter classes respectively. These will be applied to the
    tokenizer during creation.
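
    For example (an illustrative sketch; do_something stands in for the
    caller's own checking logic)::

        tknzr = get_tokenizer("en_US", filters=(URLFilter,))
        for (word, pos) in tknzr("see http://example.com for details"):
            do_something(word)  # sees "see", "for", "details"; the URL is skipped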
    """
    if tag is None:
        tag = "en"
    # "filters" used to be the second argument. Try to catch cases
    # where it is given positionally and issue a DeprecationWarning.
    if chunkers is not None and filters is None:
        chunkers = list(chunkers)
        if chunkers:
            try:
                chunkers_are_filters = issubclass(chunkers[0], Filter)
            except TypeError:
                pass
            else:
                if chunkers_are_filters:
                    msg = "passing 'filters' as a non-keyword argument " \
                          "to get_tokenizer() is deprecated"
                    warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
                    filters = chunkers
                    chunkers = None
    # Ensure only '_' used as separator
    tag = tag.replace("-", "_")
    # First try the whole tag
    tkFunc = _try_tokenizer(tag)
    if tkFunc is None:
        # Try just the base
        base = tag.split("_")[0]
        tkFunc = _try_tokenizer(base)
        if tkFunc is None:
            msg = "No tokenizer found for language '%s'" % (tag,)
            raise TokenizerNotFoundError(msg)
    # Given the language-specific tokenizer, we now build up the
    # end result as follows:
    #    * chunk the text using any given chunkers in turn
    #    * begin with basic whitespace tokenization
    #    * apply each of the given filters in turn
    #    * apply language-specific rules
    tokenizer = basic_tokenize
    if chunkers is not None:
        chunkers = list(chunkers)
        for i in xrange(len(chunkers) - 1, -1, -1):
            tokenizer = wrap_tokenizer(chunkers[i], tokenizer)
    if filters is not None:
        for f in filters:
            tokenizer = f(tokenizer)
    tokenizer = wrap_tokenizer(tokenizer, tkFunc)
    return tokenizer


get_tokenizer._DOC_ERRORS = ["py", "py"]


class empty_tokenize(tokenize):
    """Tokenizer class that yields no elements."""
    _DOC_ERRORS = []

    def __init__(self):
        tokenize.__init__(self, "")

    def next(self):
        raise StopIteration()


class unit_tokenize(tokenize):
    """Tokenizer class that yields the text as a single token."""
    _DOC_ERRORS = []

    def __init__(self, text):
        tokenize.__init__(self, text)
        self._done = False

    def next(self):
        if self._done:
            raise StopIteration()
        self._done = True
        return (self._text, 0)


class basic_tokenize(tokenize):
    """Tokenizer class that performs very basic word-finding.

    This tokenizer does the most basic thing that could work - it splits
    text into words based on whitespace boundaries, and removes basic
    punctuation symbols from the start and end of each word.
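
    For example, surrounding quote and punctuation characters are removed,
    but the reported positions still index into the original string::

        tknzr = basic_tokenize('"Hello," he said.')
        list(tknzr)   # yields [("Hello",1), ("he",9), ("said",12)]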
    """
    _DOC_ERRORS = []

    # Chars to remove from start/end of words
    strip_from_start = '"' + "'`(["
    strip_from_end = '"' + "'`]).!,?;:"

    def next(self):
        text = self._text
        offset = self._offset
        while True:
            if offset >= len(text):
                break
            # Find start of next word
            while offset < len(text) and text[offset].isspace():
                offset += 1
            sPos = offset
            # Find end of word
            while offset < len(text) and not text[offset].isspace():
                offset += 1
            ePos = offset
            self._offset = offset
            # Strip chars from front/end of word
            while sPos < len(text) and text[sPos] in self.strip_from_start:
                sPos += 1
            while 0 < ePos and text[ePos - 1] in self.strip_from_end:
                ePos -= 1
            # Return if word isn't empty
            if (sPos < ePos):
                return (text[sPos:ePos], sPos)
        raise StopIteration()


def _try_tokenizer(modName):
    """Look for a tokenizer in the named module.

    Returns the function if found, None otherwise.
    """
    modBase = "enchant.tokenize."
    funcName = "tokenize"
    modName = modBase + modName
    try:
        mod = __import__(modName, globals(), {}, funcName)
        return getattr(mod, funcName)
    except ImportError:
        return None


def wrap_tokenizer(tk1, tk2):
    """Wrap one tokenizer inside another.

    This function takes two tokenizer functions 'tk1' and 'tk2',
    and returns a new tokenizer function that passes the output
    of tk1 through tk2 before yielding it to the calling code.
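
    For example, get_tokenizer() uses this to run each chunk found by a
    chunker through the basic whitespace tokenizer::

        tknzr = wrap_tokenizer(HTMLChunker, basic_tokenize)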
    """
    # This logic is already implemented in the Filter class.
    # We simply use tk2 as the _split() method for a filter
    # around tk1.
    tkW = Filter(tk1)
    tkW._split = tk2
    return tkW


wrap_tokenizer._DOC_ERRORS = ["tk", "tk", "tk", "tk"]


class Chunker(tokenize):
    """Base class for text chunking functions.

    A chunker is designed to chunk text into large blocks of tokens. It
    has the same interface as a tokenizer but is for a different purpose.
    """
    pass


class Filter(object):
    """Base class for token filtering functions.

    A filter is designed to wrap a tokenizer (or another filter) and do
    two things:

        * skip over tokens
        * split tokens into sub-tokens

    Subclasses have two basic options for customising their behaviour. The
    method _skip(word) may be overridden to return True for words that
    should be skipped, and False otherwise. The method _split(word) may
    be overridden with a tokenization function that will be applied to
    further tokenize any words that aren't skipped.
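
    For example, a filter that skips over any word starting with "#" could
    be written as follows (an illustrative sketch; the class name is
    invented for this example)::

        class HashTagFilter(Filter):
            def _skip(self, word):
                return word.startswith("#")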
    """

    def __init__(self, tokenizer):
        """Filter class constructor."""
        self._tokenizer = tokenizer

    def __call__(self, *args, **kwds):
        tkn = self._tokenizer(*args, **kwds)
        return self._TokenFilter(tkn, self._skip, self._split)

    def _skip(self, word):
        """Filter method for identifying skippable tokens.

        If this method returns true, the given word will be skipped by
        the filter. This should be overridden in subclasses to produce the
        desired functionality. The default behaviour is not to skip any words.
        """
        return False

    def _split(self, word):
        """Filter method for sub-tokenization of tokens.

        This method must be a tokenization function that will split the
        given word into sub-tokens according to the needs of the filter.
        The default behaviour is not to split any words.
        """
        return unit_tokenize(word)

    class _TokenFilter(object):
        """Private inner class implementing the tokenizer-wrapping logic.

        This might seem convoluted, but we're trying to create something
        akin to a meta-class - when Filter(tknzr) is called it must return
        a *callable* that can then be applied to a particular string to
        perform the tokenization. Since we need to manage a lot of state
        during tokenization, returning a class is the best option.
        """
        _DOC_ERRORS = ["tknzr"]

        def __init__(self, tokenizer, skip, split):
            self._skip = skip
            self._split = split
            self._tokenizer = tokenizer
            # for managing state of sub-tokenization
            self._curtok = empty_tokenize()
            self._curword = ""
            self._curpos = 0

        def __iter__(self):
            return self

        def __next__(self):
            return self.next()

        def next(self):
            # Try to get the next sub-token from the word currently being split.
            # If unavailable, move on to the next word and try again.
            try:
                (word, pos) = next(self._curtok)
                return (word, pos + self._curpos)
            except StopIteration:
                (word, pos) = next(self._tokenizer)
                while self._skip(word):
                    (word, pos) = next(self._tokenizer)
                self._curword = word
                self._curpos = pos
                self._curtok = self._split(word)
                return self.next()

        # Pass on access to 'offset' to the underlying tokenizer.
        def _get_offset(self):
            return self._tokenizer.offset

        def _set_offset(self, offset):
            msg = "changing a tokenizer's 'offset' attribute is deprecated;" \
                  " use the 'set_offset' method"
            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
            self.set_offset(offset)

        offset = property(_get_offset, _set_offset)

        def set_offset(self, val, replaced=False):
            self._tokenizer.set_offset(val, replaced=replaced)
            # If we stay within the current word, also set on _curtok.
            # Otherwise, throw away _curtok and set to empty iterator.
            subval = val - self._curpos
            if subval >= 0 and subval < len(self._curword) and not replaced:
                self._curtok.set_offset(subval)
            else:
                self._curtok = empty_tokenize()
                self._curword = ""
                self._curpos = 0


# Pre-defined chunkers and filters start here

class URLFilter(Filter):
    """Filter skipping over URLs.
    This filter skips any words matching the following regular expression:

        ^[a-zA-Z]+:\/\/[^\s].*

    That is, any words that are URLs.
    """
    _DOC_ERRORS = ["zA"]
    _pattern = re.compile(r"^[a-zA-Z]+:\/\/[^\s].*")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False


class WikiWordFilter(Filter):
    """Filter skipping over WikiWords.
    This filter skips any words matching the following regular expression:

        ^([A-Z]\w+[A-Z]+\w+)

    That is, any words that are WikiWords.
    """
    _pattern = re.compile(r"^([A-Z]\w+[A-Z]+\w+)")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False


class EmailFilter(Filter):
    """Filter skipping over email addresses.
    This filter skips any words matching the following regular expression:

        ^.+@[^\.].*\.[a-z]{2,}$

    That is, any words that resemble email addresses.
    """
    _pattern = re.compile(r"^.+@[^\.].*\.[a-z]{2,}$")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False


class HTMLChunker(Chunker):
    """Chunker for breaking up HTML documents into chunks of checkable text.

    The operation of this chunker is very simple - anything between a "<"
    and a ">" will be ignored. Later versions may improve the algorithm
    slightly.
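
    For example::

        chunks = HTMLChunker("<b>hello</b> world")
        list(chunks)   # yields [("hello",3), (" world",12)]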
    """

    def next(self):
        text = self._text
        offset = self.offset
        while True:
            if offset >= len(text):
                break
            # Skip to the end of the current tag, if any.
            if text[offset] == "<":
                maybeTag = offset
                if self._is_tag(text, offset):
                    while text[offset] != ">":
                        offset += 1
                        if offset == len(text):
                            offset = maybeTag + 1
                            break
                    else:
                        offset += 1
                else:
                    offset = maybeTag + 1
            sPos = offset
            # Find the start of the next tag.
            while offset < len(text) and text[offset] != "<":
                offset += 1
            ePos = offset
            self._offset = offset
            # Return if chunk isn't empty
            if (sPos < ePos):
                return (text[sPos:ePos], sPos)
        raise StopIteration()

    def _is_tag(self, text, offset):
        if offset + 1 < len(text):
            if text[offset + 1].isalpha():
                return True
            if text[offset + 1] == "/":
                return True
        return False


# TODO: LaTeXChunker