From e0b855d27ad342c4e785eec8ded9280cfc69fc67 Mon Sep 17 00:00:00 2001
From: Степаненко Ольга
Date: Thu, 6 May 2021 14:37:08 +0300
Subject: [PATCH] Add ru tokenizer

---
 .../site-packages/enchant/tokenize/ru.py | 185 ++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 venv/lib/python3.6/site-packages/enchant/tokenize/ru.py

diff --git a/venv/lib/python3.6/site-packages/enchant/tokenize/ru.py b/venv/lib/python3.6/site-packages/enchant/tokenize/ru.py
new file mode 100644
index 0000000..7e15379
--- /dev/null
+++ b/venv/lib/python3.6/site-packages/enchant/tokenize/ru.py
@@ -0,0 +1,185 @@
+# pyenchant
+#
+# Copyright (C) 2004-2008, Ryan Kelly
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+#
+# In addition, as a special exception, you are
+# given permission to link the code of this program with
+# non-LGPL Spelling Provider libraries (eg: a MSFT Office
+# spell checker backend) and distribute linked combinations including
+# the two. You must obey the GNU Lesser General Public License in all
+# respects for all of the code used other than said providers. If you modify
+# this file, you may extend this exception to your version of the
+# file, but you are not obligated to do so. If you do not wish to
+# do so, delete this exception statement from your version.
+#
+"""
+
+    enchant.tokenize.ru: Tokenizer for the Russian language
+
+    This module implements a PyEnchant text tokenizer for the Russian
+    language, based on very simple rules.
+
+"""
+
+import unicodedata
+
+import enchant.tokenize
+
+
+class tokenize(enchant.tokenize.tokenize):  # noqa: N801
+    """Iterator splitting text into words, reporting position.
+
+    This iterator takes a text string as input, and yields tuples
+    representing each distinct word found in the text. The tuples
+    take the form:
+
+        (<word>, <pos>)
+
+    Where <word> is the word string found and <pos> is the position
+    of the start of the word within the text.
+
+    The optional argument <valid_chars> may be used to specify a
+    list of additional characters that can form part of a word.
+    By default, this list contains only the apostrophe ('). Note that
+    these characters cannot appear at the start or end of a word.
+    """
+
+    _DOC_ERRORS = ["pos", "pos"]
+
+    def __init__(self, text, valid_chars=None):
+        self._valid_chars = valid_chars
+        self._text = text
+        self._offset = 0
+        # Select the proper implementation of self._consume_alpha.
+        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
+        # so we can't use isinstance(text, str) to detect unicode.
+        # Instead we typetest the first character of the text.
+        # If there are no characters then it doesn't matter which
+        # implementation we use, since it won't be called anyway.
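+        # For example (illustrative): text[0] of "слово" is a str, which
+        # selects the unicode implementation, while b"word"[0] is an int
+        # on Python 3, which selects the byte-oriented implementation.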
+        try:
+            char1 = text[0]
+        except IndexError:
+            self._initialize_for_binary()
+        else:
+            if isinstance(char1, str):
+                self._initialize_for_unicode()
+            else:
+                self._initialize_for_binary()
+
+    def _initialize_for_binary(self):
+        self._consume_alpha = self._consume_alpha_b
+        if self._valid_chars is None:
+            self._valid_chars = ("'",)
+
+    def _initialize_for_unicode(self):
+        self._consume_alpha = self._consume_alpha_u
+        if self._valid_chars is None:
+            # XXX TODO: this doesn't seem to work correctly with the
+            # MySpell provider, disabling for now.
+            # Allow unicode typographic apostrophe
+            # self._valid_chars = (u"'", u"\u2019")
+            self._valid_chars = ("'",)
+
+    def _consume_alpha_b(self, text, offset):
+        """Consume an alphabetic character from the given bytestring.
+
+        Given a bytestring and the current offset, this method returns
+        the number of characters occupied by the next alphabetic character
+        in the string. Non-ASCII bytes are interpreted as utf-8 and can
+        result in multiple characters being consumed.
+        """
+        assert offset < len(text)
+        if text[offset].isalpha():
+            return 1
+        elif text[offset] >= "\x80":
+            return self._consume_alpha_utf8(text, offset)
+        return 0
+
+    def _consume_alpha_utf8(self, text, offset):
+        """Consume a sequence of utf8 bytes forming an alphabetic character."""
+        incr = 2
+        u = ""
+        while not u and incr <= 4:
+            try:
+                try:
+                    # In the common case this will be a string
+                    u = text[offset : offset + incr].decode("utf8")
+                except AttributeError:
+                    # Looks like it was e.g. a mutable char array.
+                    try:
+                        s = text[offset : offset + incr].tostring()
+                    except AttributeError:
+                        s = "".join([c for c in text[offset : offset + incr]])
+                    u = s.decode("utf8")
+            except UnicodeDecodeError:
+                incr += 1
+        if not u:
+            return 0
+        if u.isalpha():
+            return incr
+        if unicodedata.category(u)[0] == "M":
+            return incr
+        return 0
+
+    def _consume_alpha_u(self, text, offset):
+        """Consume an alphabetic character from the given unicode string.
+
+        Given a unicode string and the current offset, this method returns
+        the number of characters occupied by the next alphabetic character
+        in the string. Trailing combining characters are consumed as a
+        single letter.
+        """
+        assert offset < len(text)
+        incr = 0
+        if text[offset].isalpha():
+            incr = 1
+            while offset + incr < len(text):
+                if unicodedata.category(text[offset + incr])[0] != "M":
+                    break
+                incr += 1
+        return incr
+
+    def next(self):
+        text = self._text
+        offset = self._offset
+        while offset < len(text):
+            # Find start of next word (must be alpha)
+            while offset < len(text):
+                incr = self._consume_alpha(text, offset)
+                if incr:
+                    break
+                offset += 1
+            cur_pos = offset
+            # Find end of word, allowing valid_chars inside it
+            while offset < len(text):
+                incr = self._consume_alpha(text, offset)
+                if not incr:
+                    if text[offset] in self._valid_chars:
+                        incr = 1
+                    else:
+                        break
+                offset += incr
+            # Return the word if it isn't empty
+            if cur_pos != offset:
+                # Make sure the word doesn't end with a valid_char
+                while text[offset - 1] in self._valid_chars:
+                    offset = offset - 1
+                self._offset = offset
+                return (text[cur_pos:offset], cur_pos)
+        self._offset = offset
+        raise StopIteration()
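
Usage note: on the unicode path the tokenizer splits on str.isalpha(), which
accepts Cyrillic letters, so Russian text is handled without any
language-specific rules. A minimal sketch of how the module would be used
(the sample text is illustrative only, and it assumes the upstream
enchant.tokenize.tokenize base class supplies __iter__/__next__, as in
pyenchant itself):

    from enchant.tokenize.ru import tokenize

    text = "Привет, мир! Это тест."
    for word, pos in tokenize(text):
        print(word, pos)
    # Expected output:
    # Привет 0
    # мир 8
    # Это 13
    # тест 17

Plain Python 3 bytes input would instead take the byte-oriented path, which
assumes Python 2-style one-character strings and is kept here as in upstream
pyenchant.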