LCOV - CPython 3.12 LCOV report [commit acb105a7c1f]

LCOV - code coverage report

Current view:	top level - Modules - unicodedata.c (source / functions)		Hit	Total	Coverage
Test:	CPython 3.12 LCOV report [commit acb105a7c1f]	Lines:	559	629	88.9 %
Date:	2022-07-20 13:12:14	Functions:	35	35	100.0 %
		Branches:	357	449	79.5 %

           Branch data     Line data    Source code

       1                 :            : /* ------------------------------------------------------------------------
       2                 :            : 
       3                 :            :    unicodedata -- Provides access to the Unicode database.
       4                 :            : 
       5                 :            :    The current version number is reported in the unidata_version constant.
       6                 :            : 
       7                 :            :    Written by Marc-Andre Lemburg (mal@lemburg.com).
       8                 :            :    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
       9                 :            :    Modified by Martin v. Löwis (martin@v.loewis.de)
      10                 :            : 
      11                 :            :    Copyright (c) Corporation for National Research Initiatives.
      12                 :            : 
      13                 :            :    ------------------------------------------------------------------------ */
      14                 :            : 
      15                 :            : #ifndef Py_BUILD_CORE_BUILTIN
      16                 :            : #  define Py_BUILD_CORE_MODULE 1
      17                 :            : #endif
      18                 :            : 
      19                 :            : #define PY_SSIZE_T_CLEAN
      20                 :            : 
      21                 :            : #include "Python.h"
      22                 :            : #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
      23                 :            : #include "structmember.h"         // PyMemberDef
      24                 :            : 
      25                 :            : #include <stdbool.h>
      26                 :            : 
      27                 :            : /*[clinic input]
      28                 :            : module unicodedata
      29                 :            : class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
      30                 :            : [clinic start generated code]*/
      31                 :            : /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
      32                 :            : 
      33                 :            : /* character properties */
      34                 :            : 
      35                 :            : typedef struct {
      36                 :            :     const unsigned char category;       /* index into
      37                 :            :                                            _PyUnicode_CategoryNames */
      38                 :            :     const unsigned char combining;      /* combining class value 0 - 255 */
      39                 :            :     const unsigned char bidirectional;  /* index into
      40                 :            :                                            _PyUnicode_BidirectionalNames */
      41                 :            :     const unsigned char mirrored;       /* true if mirrored in bidir mode */
      42                 :            :     const unsigned char east_asian_width;       /* index into
      43                 :            :                                                    _PyUnicode_EastAsianWidthNames */
      44                 :            :     const unsigned char normalization_quick_check; /* see is_normalized() */
      45                 :            : } _PyUnicode_DatabaseRecord;
      46                 :            : 
      47                 :            : typedef struct change_record {
      48                 :            :     /* sequence of fields should be the same as in merge_old_version */
      49                 :            :     const unsigned char bidir_changed;
      50                 :            :     const unsigned char category_changed;
      51                 :            :     const unsigned char decimal_changed;
      52                 :            :     const unsigned char mirrored_changed;
      53                 :            :     const unsigned char east_asian_width_changed;
      54                 :            :     const double numeric_changed;
      55                 :            : } change_record;
      56                 :            : 
      57                 :            : /* data file generated by Tools/unicode/makeunicodedata.py */
      58                 :            : #include "unicodedata_db.h"
      59                 :            : 
      60                 :            : static const _PyUnicode_DatabaseRecord*
      61                 :   12645811 : _getrecord_ex(Py_UCS4 code)
      62                 :            : {
      63                 :            :     int index;
      64         [ -  + ]:   12645811 :     if (code >= 0x110000)
      65                 :          0 :         index = 0;
      66                 :            :     else {
      67                 :   12645811 :         index = index1[(code>>SHIFT)];
      68                 :   12645811 :         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
      69                 :            :     }
      70                 :            : 
      71                 :   12645811 :     return &_PyUnicode_Database_Records[index];
      72                 :            : }
      73                 :            : 
      74                 :            : /* ------------- Previous-version API ------------------------------------- */
      75                 :            : typedef struct previous_version {
      76                 :            :     PyObject_HEAD
      77                 :            :     const char *name;
      78                 :            :     const change_record* (*getrecord)(Py_UCS4);
      79                 :            :     Py_UCS4 (*normalization)(Py_UCS4);
      80                 :            : } PreviousDBVersion;
      81                 :            : 
      82                 :            : #include "clinic/unicodedata.c.h"
      83                 :            : 
      84                 :            : #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
      85                 :            : 
      86                 :            : static PyMemberDef DB_members[] = {
      87                 :            :         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
      88                 :            :         {NULL}
      89                 :            : };
      90                 :            : 
      91                 :            : // Check if self is a unicodedata.UCD instance.
      92                 :            : // If self is NULL (when the PyCapsule C API is used), return 0.
      93                 :            : // PyModule_Check() is used to avoid having to retrieve the ucd_type.
      94                 :            : // See the unicodedata_functions comment for the rationale of this macro.
      95                 :            : #define UCD_Check(self) (self != NULL && !PyModule_Check(self))
      96                 :            : 
      97                 :            : static PyObject*
      98                 :         81 : new_previous_version(PyTypeObject *ucd_type,
      99                 :            :                      const char*name, const change_record* (*getrecord)(Py_UCS4),
     100                 :            :                      Py_UCS4 (*normalization)(Py_UCS4))
     101                 :            : {
     102                 :            :     PreviousDBVersion *self;
     103                 :         81 :     self = PyObject_GC_New(PreviousDBVersion, ucd_type);
     104         [ -  + ]:         81 :     if (self == NULL)
     105                 :          0 :         return NULL;
     106                 :         81 :     self->name = name;
     107                 :         81 :     self->getrecord = getrecord;
     108                 :         81 :     self->normalization = normalization;
     109                 :         81 :     PyObject_GC_Track(self);
     110                 :         81 :     return (PyObject*)self;
     111                 :            : }
     112                 :            : 
     113                 :            : 
     114                 :            : /* --- Module API --------------------------------------------------------- */
     115                 :            : 
     116                 :            : /*[clinic input]
     117                 :            : unicodedata.UCD.decimal
     118                 :            : 
     119                 :            :     self: self
     120                 :            :     chr: int(accept={str})
     121                 :            :     default: object=NULL
     122                 :            :     /
     123                 :            : 
     124                 :            : Converts a Unicode character into its equivalent decimal value.
     125                 :            : 
     126                 :            : Returns the decimal value assigned to the character chr as integer.
     127                 :            : If no such value is defined, default is returned, or, if not given,
     128                 :            : ValueError is raised.
     129                 :            : [clinic start generated code]*/
     130                 :            : 
     131                 :            : static PyObject *
     132                 :    1179655 : unicodedata_UCD_decimal_impl(PyObject *self, int chr,
     133                 :            :                              PyObject *default_value)
     134                 :            : /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
     135                 :            : {
     136                 :    1179655 :     int have_old = 0;
     137                 :            :     long rc;
     138                 :    1179655 :     Py_UCS4 c = (Py_UCS4)chr;
     139                 :            : 
     140   [ +  -  -  + ]:    1179655 :     if (UCD_Check(self)) {
     141                 :          0 :         const change_record *old = get_old_record(self, c);
     142         [ #  # ]:          0 :         if (old->category_changed == 0) {
     143                 :            :             /* unassigned */
     144                 :          0 :             have_old = 1;
     145                 :          0 :             rc = -1;
     146                 :            :         }
     147         [ #  # ]:          0 :         else if (old->decimal_changed != 0xFF) {
     148                 :          0 :             have_old = 1;
     149                 :          0 :             rc = old->decimal_changed;
     150                 :            :         }
     151                 :            :     }
     152                 :            : 
     153         [ +  - ]:    1179655 :     if (!have_old)
     154                 :    1179655 :         rc = Py_UNICODE_TODECIMAL(c);
     155         [ +  + ]:    1179655 :     if (rc < 0) {
     156         [ +  + ]:    1178623 :         if (default_value == NULL) {
     157                 :          1 :             PyErr_SetString(PyExc_ValueError,
     158                 :            :                             "not a decimal");
     159                 :          1 :             return NULL;
     160                 :            :         }
     161                 :            :         else {
     162                 :    1178622 :             Py_INCREF(default_value);
     163                 :    1178622 :             return default_value;
     164                 :            :         }
     165                 :            :     }
     166                 :       1032 :     return PyLong_FromLong(rc);
     167                 :            : }
     168                 :            : 
     169                 :            : /*[clinic input]
     170                 :            : unicodedata.UCD.digit
     171                 :            : 
     172                 :            :     self: self
     173                 :            :     chr: int(accept={str})
     174                 :            :     default: object=NULL
     175                 :            :     /
     176                 :            : 
     177                 :            : Converts a Unicode character into its equivalent digit value.
     178                 :            : 
     179                 :            : Returns the digit value assigned to the character chr as integer.
     180                 :            : If no such value is defined, default is returned, or, if not given,
     181                 :            : ValueError is raised.
     182                 :            : [clinic start generated code]*/
     183                 :            : 
     184                 :            : static PyObject *
     185                 :    1179655 : unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
     186                 :            : /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
     187                 :            : {
     188                 :            :     long rc;
     189                 :    1179655 :     Py_UCS4 c = (Py_UCS4)chr;
     190                 :    1179655 :     rc = Py_UNICODE_TODIGIT(c);
     191         [ +  + ]:    1179655 :     if (rc < 0) {
     192         [ +  + ]:    1178399 :         if (default_value == NULL) {
     193                 :          1 :             PyErr_SetString(PyExc_ValueError, "not a digit");
     194                 :          1 :             return NULL;
     195                 :            :         }
     196                 :            :         else {
     197                 :    1178398 :             Py_INCREF(default_value);
     198                 :    1178398 :             return default_value;
     199                 :            :         }
     200                 :            :     }
     201                 :       1256 :     return PyLong_FromLong(rc);
     202                 :            : }
     203                 :            : 
     204                 :            : /*[clinic input]
     205                 :            : unicodedata.UCD.numeric
     206                 :            : 
     207                 :            :     self: self
     208                 :            :     chr: int(accept={str})
     209                 :            :     default: object=NULL
     210                 :            :     /
     211                 :            : 
     212                 :            : Converts a Unicode character into its equivalent numeric value.
     213                 :            : 
     214                 :            : Returns the numeric value assigned to the character chr as float.
     215                 :            : If no such value is defined, default is returned, or, if not given,
     216                 :            : ValueError is raised.
     217                 :            : [clinic start generated code]*/
     218                 :            : 
     219                 :            : static PyObject *
     220                 :    1114955 : unicodedata_UCD_numeric_impl(PyObject *self, int chr,
     221                 :            :                              PyObject *default_value)
     222                 :            : /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
     223                 :            : {
     224                 :    1114955 :     int have_old = 0;
     225                 :            :     double rc;
     226                 :    1114955 :     Py_UCS4 c = (Py_UCS4)chr;
     227                 :            : 
     228   [ +  -  -  + ]:    1114955 :     if (UCD_Check(self)) {
     229                 :          0 :         const change_record *old = get_old_record(self, c);
     230         [ #  # ]:          0 :         if (old->category_changed == 0) {
     231                 :            :             /* unassigned */
     232                 :          0 :             have_old = 1;
     233                 :          0 :             rc = -1.0;
     234                 :            :         }
     235         [ #  # ]:          0 :         else if (old->decimal_changed != 0xFF) {
     236                 :          0 :             have_old = 1;
     237                 :          0 :             rc = old->decimal_changed;
     238                 :            :         }
     239                 :            :     }
     240                 :            : 
     241         [ +  - ]:    1114955 :     if (!have_old)
     242                 :    1114955 :         rc = Py_UNICODE_TONUMERIC(c);
     243         [ +  + ]:    1114955 :     if (rc == -1.0) {
     244         [ +  + ]:    1112243 :         if (default_value == NULL) {
     245                 :          1 :             PyErr_SetString(PyExc_ValueError, "not a numeric character");
     246                 :          1 :             return NULL;
     247                 :            :         }
     248                 :            :         else {
     249                 :    1112242 :             Py_INCREF(default_value);
     250                 :    1112242 :             return default_value;
     251                 :            :         }
     252                 :            :     }
     253                 :       2712 :     return PyFloat_FromDouble(rc);
     254                 :            : }
     255                 :            : 
     256                 :            : /*[clinic input]
     257                 :            : unicodedata.UCD.category
     258                 :            : 
     259                 :            :     self: self
     260                 :            :     chr: int(accept={str})
     261                 :            :     /
     262                 :            : 
     263                 :            : Returns the general category assigned to the character chr as string.
     264                 :            : [clinic start generated code]*/
     265                 :            : 
     266                 :            : static PyObject *
     267                 :    2229039 : unicodedata_UCD_category_impl(PyObject *self, int chr)
     268                 :            : /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
     269                 :            : {
     270                 :            :     int index;
     271                 :    2229039 :     Py_UCS4 c = (Py_UCS4)chr;
     272                 :    2229039 :     index = (int) _getrecord_ex(c)->category;
     273   [ +  -  +  + ]:    2229039 :     if (UCD_Check(self)) {
     274                 :        810 :         const change_record *old = get_old_record(self, c);
     275         [ +  + ]:        810 :         if (old->category_changed != 0xFF)
     276                 :          5 :             index = old->category_changed;
     277                 :            :     }
     278                 :    2229039 :     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
     279                 :            : }
     280                 :            : 
     281                 :            : /*[clinic input]
     282                 :            : unicodedata.UCD.bidirectional
     283                 :            : 
     284                 :            :     self: self
     285                 :            :     chr: int(accept={str})
     286                 :            :     /
     287                 :            : 
     288                 :            : Returns the bidirectional class assigned to the character chr as string.
     289                 :            : 
     290                 :            : If no such value is defined, an empty string is returned.
     291                 :            : [clinic start generated code]*/
     292                 :            : 
     293                 :            : static PyObject *
     294                 :    2228464 : unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
     295                 :            : /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
     296                 :            : {
     297                 :            :     int index;
     298                 :    2228464 :     Py_UCS4 c = (Py_UCS4)chr;
     299                 :    2228464 :     index = (int) _getrecord_ex(c)->bidirectional;
     300   [ +  -  +  + ]:    2228464 :     if (UCD_Check(self)) {
     301                 :        236 :         const change_record *old = get_old_record(self, c);
     302         [ -  + ]:        236 :         if (old->category_changed == 0)
     303                 :          0 :             index = 0; /* unassigned */
     304         [ -  + ]:        236 :         else if (old->bidir_changed != 0xFF)
     305                 :          0 :             index = old->bidir_changed;
     306                 :            :     }
     307                 :    2228464 :     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
     308                 :            : }
     309                 :            : 
     310                 :            : /*[clinic input]
     311                 :            : unicodedata.UCD.combining -> int
     312                 :            : 
     313                 :            :     self: self
     314                 :            :     chr: int(accept={str})
     315                 :            :     /
     316                 :            : 
     317                 :            : Returns the canonical combining class assigned to the character chr as integer.
     318                 :            : 
     319                 :            : Returns 0 if no combining class is defined.
     320                 :            : [clinic start generated code]*/
     321                 :            : 
     322                 :            : static int
     323                 :    1114116 : unicodedata_UCD_combining_impl(PyObject *self, int chr)
     324                 :            : /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
     325                 :            : {
     326                 :            :     int index;
     327                 :    1114116 :     Py_UCS4 c = (Py_UCS4)chr;
     328                 :    1114116 :     index = (int) _getrecord_ex(c)->combining;
     329   [ +  -  -  + ]:    1114116 :     if (UCD_Check(self)) {
     330                 :          0 :         const change_record *old = get_old_record(self, c);
     331         [ #  # ]:          0 :         if (old->category_changed == 0)
     332                 :          0 :             index = 0; /* unassigned */
     333                 :            :     }
     334                 :    1114116 :     return index;
     335                 :            : }
     336                 :            : 
     337                 :            : /*[clinic input]
     338                 :            : unicodedata.UCD.mirrored -> int
     339                 :            : 
     340                 :            :     self: self
     341                 :            :     chr: int(accept={str})
     342                 :            :     /
     343                 :            : 
     344                 :            : Returns the mirrored property assigned to the character chr as integer.
     345                 :            : 
     346                 :            : Returns 1 if the character has been identified as a "mirrored"
     347                 :            : character in bidirectional text, 0 otherwise.
     348                 :            : [clinic start generated code]*/
     349                 :            : 
     350                 :            : static int
     351                 :    1114118 : unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
     352                 :            : /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
     353                 :            : {
     354                 :            :     int index;
     355                 :    1114118 :     Py_UCS4 c = (Py_UCS4)chr;
     356                 :    1114118 :     index = (int) _getrecord_ex(c)->mirrored;
     357   [ +  -  +  + ]:    1114118 :     if (UCD_Check(self)) {
     358                 :          1 :         const change_record *old = get_old_record(self, c);
     359         [ -  + ]:          1 :         if (old->category_changed == 0)
     360                 :          0 :             index = 0; /* unassigned */
     361         [ +  - ]:          1 :         else if (old->mirrored_changed != 0xFF)
     362                 :          1 :             index = old->mirrored_changed;
     363                 :            :     }
     364                 :    1114118 :     return index;
     365                 :            : }
     366                 :            : 
     367                 :            : /*[clinic input]
     368                 :            : unicodedata.UCD.east_asian_width
     369                 :            : 
     370                 :            :     self: self
     371                 :            :     chr: int(accept={str})
     372                 :            :     /
     373                 :            : 
     374                 :            : Returns the east asian width assigned to the character chr as string.
     375                 :            : [clinic start generated code]*/
     376                 :            : 
     377                 :            : static PyObject *
     378                 :          9 : unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
     379                 :            : /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
     380                 :            : {
     381                 :            :     int index;
     382                 :          9 :     Py_UCS4 c = (Py_UCS4)chr;
     383                 :          9 :     index = (int) _getrecord_ex(c)->east_asian_width;
     384   [ +  -  +  + ]:          9 :     if (UCD_Check(self)) {
     385                 :          1 :         const change_record *old = get_old_record(self, c);
     386         [ -  + ]:          1 :         if (old->category_changed == 0)
     387                 :          0 :             index = 0; /* unassigned */
     388         [ +  - ]:          1 :         else if (old->east_asian_width_changed != 0xFF)
     389                 :          1 :             index = old->east_asian_width_changed;
     390                 :            :     }
     391                 :          9 :     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
     392                 :            : }
     393                 :            : 
     394                 :            : /*[clinic input]
     395                 :            : unicodedata.UCD.decomposition
     396                 :            : 
     397                 :            :     self: self
     398                 :            :     chr: int(accept={str})
     399                 :            :     /
     400                 :            : 
     401                 :            : Returns the character decomposition mapping assigned to the character chr as string.
     402                 :            : 
     403                 :            : An empty string is returned in case no such mapping is defined.
     404                 :            : [clinic start generated code]*/
     405                 :            : 
     406                 :            : static PyObject *
     407                 :    2233892 : unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
     408                 :            : /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
     409                 :            : {
     410                 :            :     char decomp[256];
     411                 :            :     int code, index, count;
     412                 :            :     size_t i;
     413                 :            :     unsigned int prefix_index;
     414                 :    2233892 :     Py_UCS4 c = (Py_UCS4)chr;
     415                 :            : 
     416                 :    2233892 :     code = (int)c;
     417                 :            : 
     418   [ +  -  -  + ]:    2233892 :     if (UCD_Check(self)) {
     419                 :          0 :         const change_record *old = get_old_record(self, c);
     420         [ #  # ]:          0 :         if (old->category_changed == 0)
     421                 :          0 :             return PyUnicode_FromString(""); /* unassigned */
     422                 :            :     }
     423                 :            : 
     424   [ +  -  -  + ]:    2233892 :     if (code < 0 || code >= 0x110000)
     425                 :          0 :         index = 0;
     426                 :            :     else {
     427                 :    2233892 :         index = decomp_index1[(code>>DECOMP_SHIFT)];
     428                 :    2233892 :         index = decomp_index2[(index<<DECOMP_SHIFT)+
     429                 :    2233892 :                              (code&((1<<DECOMP_SHIFT)-1))];
     430                 :            :     }
     431                 :            : 
     432                 :            :     /* high byte is the number of decomp_data entries that follow (usually
     433                 :            :        one or two), low byte is the prefix code (index into decomp_prefix) */
     434                 :    2233892 :     count = decomp_data[index] >> 8;
     435                 :            : 
     436                 :            :     /* XXX: could allocate the PyString up front instead
     437                 :            :        (strlen(prefix) + 5 * count + 1 bytes) */
     438                 :            : 
     439                 :            :     /* Based on how index is calculated above and decomp_data is generated
     440                 :            :        from Tools/unicode/makeunicodedata.py, it should not be possible
     441                 :            :        to overflow decomp_prefix. */
     442                 :    2233892 :     prefix_index = decomp_data[index] & 255;
     443                 :            :     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
     444                 :            : 
     445                 :            :     /* copy prefix */
     446                 :    2233892 :     i = strlen(decomp_prefix[prefix_index]);
     447                 :    2233892 :     memcpy(decomp, decomp_prefix[prefix_index], i);
     448                 :            : 
     449         [ +  + ]:    2259698 :     while (count-- > 0) {
     450         [ +  + ]:      25806 :         if (i)
     451                 :      19623 :             decomp[i++] = ' ';
     452                 :            :         assert(i < sizeof(decomp));
     453                 :      25806 :         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
     454                 :      25806 :                       decomp_data[++index]);
     455                 :      25806 :         i += strlen(decomp + i);
     456                 :            :     }
     457                 :    2233892 :     return PyUnicode_FromStringAndSize(decomp, i);
     458                 :            : }
     459                 :            : /* Look up the decomp_data record for `code`; out-params: *index = offset of its first entry, *prefix = prefix code, *count = number of entries. */
     460                 :            : static void
     461                 :     375425 : get_decomp_record(PyObject *self, Py_UCS4 code,
     462                 :            :                   int *index, int *prefix, int *count)
     463                 :            : {
     464         [ -  + ]:     375425 :     if (code >= 0x110000) {
     465                 :          0 :         *index = 0;  /* out of range: use record 0 (presumably the empty record - confirm) */
     466                 :            :     }
     467   [ +  -  +  + ]:     375425 :     else if (UCD_Check(self)
     468         [ -  + ]:        891 :              && get_old_record(self, code)->category_changed==0) {
     469                 :            :         /* unassigned in old version */
     470                 :          0 :         *index = 0;
     471                 :            :     }
     472                 :            :     else {
     473                 :     375425 :         *index = decomp_index1[(code>>DECOMP_SHIFT)];  /* first stage: block number for the high bits */
     474                 :     375425 :         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
     475                 :     375425 :                                (code&((1<<DECOMP_SHIFT)-1))];
     476                 :            :     }
     477                 :            : 
     478                 :            :     /* high byte is the number of decomp_data entries that follow (usually
     479                 :            :        one or two), low byte is the prefix code (index into decomp_prefix) */
     480                 :     375425 :     *count = decomp_data[*index] >> 8;
     481                 :     375425 :     *prefix = decomp_data[*index] & 255;
     482                 :            : 
     483                 :     375425 :     (*index)++;  /* step past the header word to the first entry */
     484                 :     375425 : }
     485                 :            : 
     486                 :            : #define SBase   0xAC00  /* first precomposed Hangul syllable */
     487                 :            : #define LBase   0x1100  /* first modern leading consonant (L) */
     488                 :            : #define VBase   0x1161  /* first modern vowel (V) */
     489                 :            : #define TBase   0x11A7  /* one below the first trailing consonant; T == TBase means "no trailing consonant" */
     490                 :            : #define LCount  19      /* number of leading consonants */
     491                 :            : #define VCount  21      /* number of vowels */
     492                 :            : #define TCount  28      /* number of trailing slots, including the "none" slot */
     493                 :            : #define NCount  (VCount*TCount)  /* syllables per leading consonant */
     494                 :            : #define SCount  (LCount*NCount)  /* total number of precomposed syllables */
     495                 :            : /* Return a new string with the canonical (k == 0) or compatibility (k != 0) decomposition of `input`, canonically ordered; NULL on failure. */
     496                 :            : static PyObject*
     497                 :     173824 : nfd_nfkd(PyObject *self, PyObject *input, int k)
     498                 :            : {
     499                 :            :     PyObject *result;
     500                 :            :     Py_UCS4 *output;
     501                 :            :     Py_ssize_t i, o, osize;  /* read index, write index, allocated length of output */
     502                 :            :     int kind;
     503                 :            :     const void *data;
     504                 :            :     /* Longest decomposition in Unicode 3.2: U+FDFA */
     505                 :            :     Py_UCS4 stack[20];  /* NOTE(review): the pushes below are not bounds-checked against this size */
     506                 :            :     Py_ssize_t space, isize;  /* free slots left in output, length of input */
     507                 :            :     int index, prefix, count, stackptr;
     508                 :            :     unsigned char prev, cur;
     509                 :            : 
     510                 :     173824 :     stackptr = 0;
     511                 :     173824 :     isize = PyUnicode_GET_LENGTH(input);
     512                 :     173824 :     space = isize;
     513                 :            :     /* Overallocate at most 10 characters. */
     514         [ +  + ]:     173824 :     if (space > 10) {
     515         [ +  - ]:        319 :         if (space <= PY_SSIZE_T_MAX - 10)
     516                 :        319 :             space += 10;
     517                 :            :     }
     518                 :            :     else {
     519                 :     173505 :         space *= 2;
     520                 :            :     }
     521                 :     173824 :     osize = space;
     522         [ +  - ]:     173824 :     output = PyMem_NEW(Py_UCS4, space);
     523         [ -  + ]:     173824 :     if (!output) {
     524                 :            :         PyErr_NoMemory();
     525                 :          0 :         return NULL;
     526                 :            :     }
     527                 :     173824 :     i = o = 0;
     528                 :     173824 :     kind = PyUnicode_KIND(input);
     529                 :     173824 :     data = PyUnicode_DATA(input);
     530                 :            : 
     531         [ +  + ]:     554043 :     while (i < isize) {
     532                 :     380219 :         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
     533         [ +  + ]:     824155 :         while(stackptr) {  /* expand until this input character is fully decomposed */
     534                 :     443936 :             Py_UCS4 code = stack[--stackptr];
     535                 :            :             /* Hangul Decomposition adds three characters in
     536                 :            :                a single step, so we need at least that much room. */
     537         [ +  + ]:     443936 :             if (space < 3) {
     538                 :            :                 Py_UCS4 *new_output;
     539                 :     101096 :                 osize += 10;
     540                 :     101096 :                 space += 10;
     541                 :     101096 :                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
     542         [ -  + ]:     101096 :                 if (new_output == NULL) {
     543                 :          0 :                     PyMem_Free(output);
     544                 :            :                     PyErr_NoMemory();
     545                 :          0 :                     return NULL;
     546                 :            :                 }
     547                 :     101096 :                 output = new_output;
     548                 :            :             }
     549                 :            :             /* Hangul Decomposition. */
     550   [ +  +  +  + ]:     443936 :             if (SBase <= code && code < (SBase+SCount)) {
     551                 :      68511 :                 int SIndex = code - SBase;
     552                 :      68511 :                 int L = LBase + SIndex / NCount;
     553                 :      68511 :                 int V = VBase + (SIndex % NCount) / TCount;
     554                 :      68511 :                 int T = TBase + SIndex % TCount;
     555                 :      68511 :                 output[o++] = L;
     556                 :      68511 :                 output[o++] = V;
     557                 :      68511 :                 space -= 2;
     558         [ +  + ]:      68511 :                 if (T != TBase) {  /* T == TBase: syllable has no trailing consonant */
     559                 :      64657 :                     output[o++] = T;
     560                 :      64657 :                     space --;
     561                 :            :                 }
     562                 :      68511 :                 continue;
     563                 :            :             }
     564                 :            :             /* old database version: apply its normalization override, if any, and reprocess the replacement */
     565   [ +  -  +  + ]:     375425 :             if (UCD_Check(self)) {
     566                 :        891 :                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
     567         [ -  + ]:        891 :                 if (value != 0) {
     568                 :          0 :                     stack[stackptr++] = value;
     569                 :          0 :                     continue;
     570                 :            :                 }
     571                 :            :             }
     572                 :            : 
     573                 :            :             /* Other decompositions. */
     574                 :     375425 :             get_decomp_record(self, code, &index, &prefix, &count);
     575                 :            : 
     576                 :            :             /* Copy character if it is not decomposable, or has a
     577                 :            :                compatibility decomposition, but we do NFD. */
     578   [ +  +  +  +  :     375425 :             if (!count || (prefix && !k)) {
                   +  + ]
     579                 :     334973 :                 output[o++] = code;
     580                 :     334973 :                 space--;
     581                 :     334973 :                 continue;
     582                 :            :             }
     583                 :            :             /* Copy decomposition onto the stack, in reverse
     584                 :            :                order.  */
     585         [ +  + ]:     104169 :             while(count) {
     586                 :      63717 :                 code = decomp_data[index + (--count)];
     587                 :      63717 :                 stack[stackptr++] = code;
     588                 :            :             }
     589                 :            :         }
     590                 :            :     }
     591                 :            : 
     592                 :     173824 :     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
     593                 :            :                                        output, o);
     594                 :     173824 :     PyMem_Free(output);
     595         [ -  + ]:     173824 :     if (!result)
     596                 :          0 :         return NULL;
     597                 :            :     /* result is guaranteed to be ready, as it is compact. */
     598                 :     173824 :     kind = PyUnicode_KIND(result);
     599                 :     173824 :     data = PyUnicode_DATA(result);
     600                 :            : 
     601                 :            :     /* Sort canonically: move each out-of-order character back past neighbours with a higher nonzero combining class. */
     602                 :     173824 :     i = 0;
     603                 :     173824 :     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
     604         [ +  + ]:     536652 :     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
     605                 :     362828 :         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
     606   [ +  +  +  +  :     362828 :         if (prev == 0 || cur == 0 || prev <= cur) {
                   +  + ]
     607                 :     342243 :             prev = cur;
     608                 :     342243 :             continue;
     609                 :            :         }
     610                 :            :         /* Non-canonical order. Need to switch *i with previous. */
     611                 :      20585 :         o = i - 1;
     612                 :      10580 :         while (1) {
     613                 :      31165 :             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
     614                 :      31165 :             PyUnicode_WRITE(kind, data, o+1,
     615                 :            :                             PyUnicode_READ(kind, data, o));
     616                 :      31165 :             PyUnicode_WRITE(kind, data, o, tmp);
     617                 :      31165 :             o--;
     618         [ +  + ]:      31165 :             if (o < 0)
     619                 :         12 :                 break;
     620                 :      31153 :             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
     621   [ +  +  +  + ]:      31153 :             if (prev == 0 || prev <= cur)
     622                 :            :                 break;
     623                 :            :         }
     624                 :      20585 :         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;  /* re-read: position i now holds a different character */
     625                 :            :     }
     626                 :     173824 :     return result;
     627                 :            : }
     628                 :            : /* Translate `code` through the range table `nfc` (terminated by an entry whose start is 0): return entry.index plus the offset within the range, or -1. */
     629                 :            : static int
     630                 :     177217 : find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
     631                 :            : {
     632                 :            :     unsigned int index;
     633         [ +  + ]:    8361076 :     for (index = 0; nfc[index].start; index++) {
     634                 :    8355374 :         unsigned int start = nfc[index].start;
     635         [ +  + ]:    8355374 :         if (code < start)  /* gives up early, so entries are presumably sorted by start - confirm */
     636                 :     114224 :             return -1;
     637         [ +  + ]:    8241150 :         if (code <= start + nfc[index].count) {  /* range is inclusive: start .. start + count */
     638                 :      57291 :             unsigned int delta = code - start;
     639                 :      57291 :             return nfc[index].index + delta;
     640                 :            :         }
     641                 :            :     }
     642                 :       5702 :     return -1;
     643                 :            : }
     644                 :            : /* Return the NFC (k == 0) or NFKC (k != 0) form of `input`: decompose with nfd_nfkd(), then recompose Hangul and table-driven pairs; NULL on failure. */
     645                 :            : static PyObject*
     646                 :      80612 : nfc_nfkc(PyObject *self, PyObject *input, int k)
     647                 :            : {
     648                 :            :     PyObject *result;
     649                 :            :     int kind;
     650                 :            :     const void *data;
     651                 :            :     Py_UCS4 *output;
     652                 :            :     Py_ssize_t i, i1, o, len;  /* i: current base position, i1: lookahead position, o: write index */
     653                 :            :     int f,l,index,index1,comb;  /* f/l: nfc_first/nfc_last indices; comb: combining class that currently blocks */
     654                 :            :     Py_UCS4 code;
     655                 :            :     Py_ssize_t skipped[20];  /* positions already consumed by a composition; NOTE(review): capacity is guarded only by an assert */
     656                 :      80612 :     int cskipped = 0;
     657                 :            : 
     658                 :      80612 :     result = nfd_nfkd(self, input, k);
     659         [ -  + ]:      80612 :     if (!result)
     660                 :          0 :         return NULL;
     661                 :            :     /* result will be "ready". */
     662                 :      80612 :     kind = PyUnicode_KIND(result);
     663                 :      80612 :     data = PyUnicode_DATA(result);
     664                 :      80612 :     len = PyUnicode_GET_LENGTH(result);
     665                 :            : 
     666                 :            :     /* We allocate a buffer for the output.
     667                 :            :        If we find that we made no changes, we still return
     668                 :            :        the NFD result. */
     669         [ +  - ]:      80612 :     output = PyMem_NEW(Py_UCS4, len);
     670         [ -  + ]:      80612 :     if (!output) {
     671                 :            :         PyErr_NoMemory();
     672                 :          0 :         Py_DECREF(result);
     673                 :          0 :         return 0;  /* i.e. NULL */
     674                 :            :     }
     675                 :      80612 :     i = o = 0;
     676                 :            : 
     677                 :      90225 :   again:
     678         [ +  + ]:     254358 :     while (i < len) {
     679         [ +  + ]:     177581 :       for (index = 0; index < cskipped; index++) {
     680         [ +  + ]:      13448 :           if (skipped[index] == i) {
     681                 :            :               /* *i character is skipped.
     682                 :            :                  Remove from list. */
     683                 :       9613 :               skipped[index] = skipped[cskipped-1];
     684                 :       9613 :               cskipped--;
     685                 :       9613 :               i++;
     686                 :       9613 :               goto again; /* continue while */
     687                 :            :           }
     688                 :            :       }
     689                 :            :       /* Hangul Composition. We don't need to check for <LV,T>
     690                 :            :          pairs, since we always have decomposed data. */
     691                 :     164133 :       code = PyUnicode_READ(kind, data, i);
     692   [ +  +  +  + ]:     164133 :       if (LBase <= code && code < (LBase+LCount) &&
     693   [ +  +  +  + ]:      93278 :           i + 1 < len &&
     694         [ +  + ]:      92657 :           VBase <= PyUnicode_READ(kind, data, i+1) &&
     695                 :      46096 :           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
     696                 :            :           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
     697                 :            :              and V character is a modern vowel (0x1161 ~ 0x1175). */
     698                 :            :           int LIndex, VIndex;
     699                 :      46095 :           LIndex = code - LBase;
     700                 :      46095 :           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
     701                 :      46095 :           code = SBase + (LIndex*VCount+VIndex)*TCount;
     702                 :      46095 :           i+=2;
     703   [ +  +  +  + ]:      90509 :           if (i < len &&
     704         [ +  + ]:      87536 :               TBase < PyUnicode_READ(kind, data, i) &&
     705                 :      43122 :               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
     706                 :            :               /* check T character is a modern trailing consonant
     707                 :            :                  (0x11A8 ~ 0x11C2). */
     708                 :      43121 :               code += PyUnicode_READ(kind, data, i)-TBase;
     709                 :      43121 :               i++;
     710                 :            :           }
     711                 :      46095 :           output[o++] = code;
     712                 :      46095 :           continue;
     713                 :            :       }
     714                 :            : 
     715                 :            :       /* code is still input[i] here */
     716                 :     118038 :       f = find_nfc_index(nfc_first, code);
     717         [ +  + ]:     118038 :       if (f == -1) {  /* cannot be the first half of any composition: copy through */
     718                 :      75997 :           output[o++] = code;
     719                 :      75997 :           i++;
     720                 :      75997 :           continue;
     721                 :            :       }
     722                 :            :       /* Find next unblocked character. */
     723                 :      42041 :       i1 = i+1;
     724                 :      42041 :       comb = 0;
     725                 :            :       /* output base character for now; might be updated later. */
     726                 :      42041 :       output[o] = PyUnicode_READ(kind, data, i);
     727         [ +  + ]:      91157 :       while (i1 < len) {
     728                 :      71755 :           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
     729                 :      71755 :           int comb1 = _getrecord_ex(code1)->combining;
     730         [ +  + ]:      71755 :           if (comb) {
     731         [ +  + ]:      47258 :               if (comb1 == 0)
     732                 :      11367 :                   break;
     733         [ +  + ]:      35891 :               if (comb >= comb1) {
     734                 :            :                   /* Character is blocked. */
     735                 :      10822 :                   i1++;
     736                 :      10822 :                   continue;
     737                 :            :               }
     738                 :            :           }
     739                 :      49566 :           l = find_nfc_index(nfc_last, code1);
     740                 :            :           /* i1 cannot be combined with i. If i1
     741                 :            :              is a starter, we don't need to look further.
     742                 :            :              Otherwise, record the combining class. */
     743         [ +  + ]:      49566 :           if (l == -1) {
     744                 :      36231 :             not_combinable:
     745         [ +  + ]:      39953 :               if (comb1 == 0)
     746                 :       3574 :                   break;
     747                 :      36379 :               comb = comb1;
     748                 :      36379 :               i1++;
     749                 :      36379 :               continue;
     750                 :            :           }
     751                 :      13335 :           index = f*TOTAL_LAST + l;  /* flat (first, last) pair index into the two-stage composition table */
     752                 :      13335 :           index1 = comp_index[index >> COMP_SHIFT];
     753                 :      13335 :           code = comp_data[(index1<<COMP_SHIFT)+
     754                 :      13335 :                            (index&((1<<COMP_SHIFT)-1))];
     755         [ +  + ]:      13335 :           if (code == 0)  /* 0 means the pair has no composition */
     756                 :       3722 :               goto not_combinable;
     757                 :            : 
     758                 :            :           /* Replace the original character. */
     759                 :       9613 :           output[o] = code;
     760                 :            :           /* Mark the second character unused. */
     761                 :            :           assert(cskipped < 20);
     762                 :       9613 :           skipped[cskipped++] = i1;
     763                 :       9613 :           i1++;
     764                 :       9613 :           f = find_nfc_index(nfc_first, output[o]);  /* the composite may itself start a further composition */
     765         [ +  + ]:       9613 :           if (f == -1)
     766                 :       7698 :               break;
     767                 :            :       }
     768                 :            :       /* Output character was already written.
     769                 :            :          Just advance the indices. */
     770                 :      42041 :       o++; i++;
     771                 :            :     }
     772         [ +  + ]:      80612 :     if (o == len) {
     773                 :            :         /* No changes. Return original string. */
     774                 :      26237 :         PyMem_Free(output);
     775                 :      26237 :         return result;
     776                 :            :     }
     777                 :      54375 :     Py_DECREF(result);
     778                 :      54375 :     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
     779                 :            :                                        output, o);
     780                 :      54375 :     PyMem_Free(output);
     781                 :      54375 :     return result;
     782                 :            : }
     783                 :            : 
     784                 :            : // This needs to match the logic in makeunicodedata.py
     785                 :            : // which constructs the quickcheck data.
     786                 :            : typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;  // each value fits the two-bit field read in is_normalized_quickcheck()
     787                 :            : 
     788                 :            : /* Run the Unicode normalization "quickcheck" algorithm.
     789                 :            :  * `nfc` selects NFC/NFKC rather than NFD/NFKD; `k` selects the compatibility (K) forms.
     790                 :            :  * Return YES or NO if quickcheck determines the input is certainly
     791                 :            :  * normalized or certainly not, and MAYBE if quickcheck is unable to
     792                 :            :  * tell.
     793                 :            :  * NO is also returned, without inspecting the input, when UCD_Check(self) is true (UCD 3.2.0 requested).
     794                 :            :  * If `yes_only` is true, then return MAYBE as soon as we determine
     795                 :            :  * the answer is not YES.
     796                 :            :  *
     797                 :            :  * For background and details on the algorithm, see UAX #15:
     798                 :            :  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
     799                 :            :  */
     800                 :            : static QuickcheckResult
     801                 :    4884428 : is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
     802                 :            :                          bool yes_only)
     803                 :            : {
     804                 :            :     /* UCD 3.2.0 is requested, quickchecks must be disabled. */
     805   [ +  -  +  + ]:    4884428 :     if (UCD_Check(self)) {
     806                 :        490 :         return NO;
     807                 :            :     }
     808                 :            : 
     809         [ +  + ]:    4883938 :     if (PyUnicode_IS_ASCII(input)) {
     810                 :      17450 :         return YES;
     811                 :            :     }
     812                 :            : 
     813                 :            :     Py_ssize_t i, len;
     814                 :            :     int kind;
     815                 :            :     const void *data;
     816                 :    4866488 :     unsigned char prev_combining = 0;
     817                 :            : 
     818                 :            :     /* The two quickcheck bits at this shift have type QuickcheckResult. */
     819   [ +  +  +  + ]:    4866488 :     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);  /* NFD: 0, NFKD: 2, NFC: 4, NFKC: 6 */
     820                 :            : 
     821                 :    4866488 :     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
     822                 :            : 
     823                 :    4866488 :     i = 0;
     824                 :    4866488 :     kind = PyUnicode_KIND(input);
     825                 :    4866488 :     data = PyUnicode_DATA(input);
     826                 :    4866488 :     len = PyUnicode_GET_LENGTH(input);
     827         [ +  + ]:    9996174 :     while (i < len) {
     828                 :    5299920 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
     829                 :    5299920 :         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
     830                 :            : 
     831                 :    5299920 :         unsigned char combining = record->combining;
     832   [ +  +  +  + ]:    5299920 :         if (combining && prev_combining > combining)
     833                 :       7052 :             return NO; /* non-canonical sort order, not normalized */
     834                 :    5292868 :         prev_combining = combining;
     835                 :            : 
     836                 :    5292868 :         unsigned char quickcheck_whole = record->normalization_quick_check;
     837         [ +  + ]:    5292868 :         if (yes_only) {
     838         [ +  + ]:    5053483 :             if (quickcheck_whole & (3 << quickcheck_shift))  /* any bit set means the field is not YES (0) */
     839                 :     163182 :                 return MAYBE;
     840                 :            :         } else {
     841      [ -  +  + ]:     239385 :             switch ((quickcheck_whole >> quickcheck_shift) & 3) {  /* YES (0) needs no action */
     842                 :          0 :             case NO:
     843                 :          0 :               return NO;
     844                 :       3384 :             case MAYBE:
     845                 :       3384 :               result = MAYBE; /* this string might need normalization */
     846                 :            :             }
     847                 :            :         }
     848                 :            :     }
     849                 :    4696254 :     return result;
     850                 :            : }
     851                 :            : 
     852                 :            : /*[clinic input]
     853                 :            : unicodedata.UCD.is_normalized
     854                 :            : 
     855                 :            :     self: self
     856                 :            :     form: unicode
     857                 :            :     unistr as input: unicode
     858                 :            :     /
     859                 :            : 
     860                 :            : Return whether the Unicode string unistr is in the normal form 'form'.
     861                 :            : 
     862                 :            : Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
     863                 :            : [clinic start generated code]*/
     864                 :            : 
     865                 :            : static PyObject *
     866                 :     113952 : unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
     867                 :            :                                    PyObject *input)
     868                 :            : /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
     869                 :            : {   /* Returns a new reference to Py_True/Py_False, or NULL with ValueError set for an unknown `form`. */
     870         [ -  + ]:     113952 :     if (PyUnicode_READY(input) == -1) {
     871                 :          0 :         return NULL;
     872                 :            :     }
     873                 :            : 
     874         [ -  + ]:     113952 :     if (PyUnicode_GET_LENGTH(input) == 0) {
     875                 :            :         /* special case empty input strings. */
     876                 :          0 :         Py_RETURN_TRUE;
     877                 :            :     }
     878                 :            : 
     879                 :            :     PyObject *result;
     880                 :     113952 :     bool nfc = false;
     881                 :     113952 :     bool k = false;
     882                 :            :     QuickcheckResult m;
     883                 :            : 
     884                 :            :     PyObject *cmp;
     885                 :     113952 :     int match = 0;
     886                 :            :     /* Translate the form name into the (nfc, k) flag pair. */
     887         [ +  + ]:     113952 :     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
     888                 :      37984 :         nfc = true;
     889                 :            :     }
     890         [ +  + ]:      75968 :     else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
     891                 :      18992 :         nfc = true;
     892                 :      18992 :         k = true;
     893                 :            :     }
     894         [ +  + ]:      56976 :     else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
     895                 :            :         /* matches default values for `nfc` and `k` */
     896                 :            :     }
     897         [ +  - ]:      18992 :     else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
     898                 :      18992 :         k = true;
     899                 :            :     }
     900                 :            :     else {
     901                 :          0 :         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     902                 :          0 :         return NULL;
     903                 :            :     }
     904                 :            : 
     905                 :     113952 :     m = is_normalized_quickcheck(self, input, nfc, k, false);
     906                 :            :     /* NOTE(review): the quickcheck answers NO whenever UCD_Check(self) holds, so that case yields False below without a full comparison - confirm intended. */
     907         [ +  + ]:     113952 :     if (m == MAYBE) {  /* undecided: normalize and compare with the input */
     908         [ +  - ]:       3100 :         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
     909         [ -  + ]:       3100 :         if (cmp == NULL) {
     910                 :          0 :             return NULL;
     911                 :            :         }
     912                 :       3100 :         match = PyUnicode_Compare(input, cmp);
     913                 :       3100 :         Py_DECREF(cmp);
     914         [ +  - ]:       3100 :         result = (match == 0) ? Py_True : Py_False;
     915                 :            :     }
     916                 :            :     else {
     917         [ +  - ]:     110852 :         result = (m == YES) ? Py_True : Py_False;
     918                 :            :     }
     919                 :            : 
     920                 :     113952 :     Py_INCREF(result);
     921                 :     113952 :     return result;
     922                 :            : }
     923                 :            : 
     924                 :            : 
     925                 :            : /*[clinic input]
     926                 :            : unicodedata.UCD.normalize
     927                 :            : 
     928                 :            :     self: self
     929                 :            :     form: unicode
     930                 :            :     unistr as input: unicode
     931                 :            :     /
     932                 :            : 
     933                 :            : Return the normal form 'form' for the Unicode string unistr.
     934                 :            : 
     935                 :            : Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
     936                 :            : [clinic start generated code]*/
     937                 :            : 
     938                 :            : static PyObject *
     939                 :    4770480 : unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     940                 :            :                                PyObject *input)
     941                 :            : /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
     942                 :            : {
     943         [ +  + ]:    4770480 :     if (PyUnicode_GET_LENGTH(input) == 0) {
     944                 :            :         /* Special case empty input strings, since resizing
     945                 :            :            them  later would cause internal errors. */
     946                 :          3 :         Py_INCREF(input);
     947                 :          3 :         return input;
     948                 :            :     }
     949                 :            : 
     950         [ +  + ]:    4770477 :     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
     951         [ +  + ]:    1192324 :         if (is_normalized_quickcheck(self, input,
     952                 :            :                                      true,  false, true) == YES) {
     953                 :    1159505 :             Py_INCREF(input);
     954                 :    1159505 :             return input;
     955                 :            :         }
     956                 :      32819 :         return nfc_nfkc(self, input, 0);
     957                 :            :     }
     958         [ +  + ]:    3578153 :     if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
     959         [ +  + ]:    1193512 :         if (is_normalized_quickcheck(self, input,
     960                 :            :                                      true,  true,  true) == YES) {
     961                 :    1148819 :             Py_INCREF(input);
     962                 :    1148819 :             return input;
     963                 :            :         }
     964                 :      44693 :         return nfc_nfkc(self, input, 1);
     965                 :            :     }
     966         [ +  + ]:    2384641 :     if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
     967         [ +  + ]:    1192326 :         if (is_normalized_quickcheck(self, input,
     968                 :            :                                      false, false, true) == YES) {
     969                 :    1151340 :             Py_INCREF(input);
     970                 :    1151340 :             return input;
     971                 :            :         }
     972                 :      40986 :         return nfd_nfkd(self, input, 0);
     973                 :            :     }
     974         [ +  + ]:    1192315 :     if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
     975         [ +  + ]:    1192314 :         if (is_normalized_quickcheck(self, input,
     976                 :            :                                      false, true,  true) == YES) {
     977                 :    1140088 :             Py_INCREF(input);
     978                 :    1140088 :             return input;
     979                 :            :         }
     980                 :      52226 :         return nfd_nfkd(self, input, 1);
     981                 :            :     }
     982                 :          1 :     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     983                 :          1 :     return NULL;
     984                 :            : }
     985                 :            : 
     986                 :            : /* -------------------------------------------------------------------- */
     987                 :            : /* unicode character name tables */
     988                 :            : 
     989                 :            : /* data file generated by Tools/unicode/makeunicodedata.py */
     990                 :            : #include "unicodename_db.h"
     991                 :            : 
     992                 :            : /* -------------------------------------------------------------------- */
     993                 :            : /* database code (cut and pasted from the unidb package) */
     994                 :            : 
     995                 :            : static unsigned long
     996                 :      18410 : _gethash(const char *s, int len, int scale)
     997                 :            : {
     998                 :            :     int i;
     999                 :      18410 :     unsigned long h = 0;
    1000                 :            :     unsigned long ix;
    1001         [ +  + ]:     588990 :     for (i = 0; i < len; i++) {
    1002                 :     570580 :         h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
    1003                 :     570580 :         ix = h & 0xff000000;
    1004         [ +  + ]:     570580 :         if (ix)
    1005                 :     488798 :             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    1006                 :            :     }
    1007                 :      18410 :     return h;
    1008                 :            : }
    1009                 :            : 
    1010                 :            : static const char * const hangul_syllables[][3] = {
    1011                 :            :     { "G",  "A",   ""   },
    1012                 :            :     { "GG", "AE",  "G"  },
    1013                 :            :     { "N",  "YA",  "GG" },
    1014                 :            :     { "D",  "YAE", "GS" },
    1015                 :            :     { "DD", "EO",  "N", },
    1016                 :            :     { "R",  "E",   "NJ" },
    1017                 :            :     { "M",  "YEO", "NH" },
    1018                 :            :     { "B",  "YE",  "D"  },
    1019                 :            :     { "BB", "O",   "L"  },
    1020                 :            :     { "S",  "WA",  "LG" },
    1021                 :            :     { "SS", "WAE", "LM" },
    1022                 :            :     { "",   "OE",  "LB" },
    1023                 :            :     { "J",  "YO",  "LS" },
    1024                 :            :     { "JJ", "U",   "LT" },
    1025                 :            :     { "C",  "WEO", "LP" },
    1026                 :            :     { "K",  "WE",  "LH" },
    1027                 :            :     { "T",  "WI",  "M"  },
    1028                 :            :     { "P",  "YU",  "B"  },
    1029                 :            :     { "H",  "EU",  "BS" },
    1030                 :            :     { 0,    "YI",  "S"  },
    1031                 :            :     { 0,    "I",   "SS" },
    1032                 :            :     { 0,    0,     "NG" },
    1033                 :            :     { 0,    0,     "J"  },
    1034                 :            :     { 0,    0,     "C"  },
    1035                 :            :     { 0,    0,     "K"  },
    1036                 :            :     { 0,    0,     "T"  },
    1037                 :            :     { 0,    0,     "P"  },
    1038                 :            :     { 0,    0,     "H"  }
    1039                 :            : };
    1040                 :            : 
    1041                 :            : /* These ranges need to match makeunicodedata.py:cjk_ranges. */
    1042                 :            : static int
    1043                 :     127288 : is_unified_ideograph(Py_UCS4 code)
    1044                 :            : {
    1045                 :            :     return
    1046   [ +  +  +  + ]:     127288 :         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
    1047   [ +  +  +  + ]:     114102 :         (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
    1048   [ +  +  +  + ]:      72107 :         (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
    1049   [ +  +  +  + ]:      72105 :         (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
    1050   [ +  +  +  + ]:      72103 :         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
    1051   [ +  -  +  + ]:      72101 :         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
    1052   [ +  +  +  -  :     258757 :         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
                   +  + ]
    1053         [ +  + ]:       4181 :         (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
    1054                 :            : }
    1055                 :            : 
    1056                 :            : /* macros used to determine if the given code point is in the PUA range that
    1057                 :            :  * we are using to store aliases and named sequences */
    1058                 :            : #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
    1059                 :            : #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
    1060                 :            :                           (cp < named_sequences_end))
    1061                 :            : 
    1062                 :            : static int
    1063                 :     112952 : _getucname(PyObject *self,
    1064                 :            :            Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
    1065                 :            : {
    1066                 :            :     /* Find the name associated with the given code point.
    1067                 :            :      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
    1068                 :            :      * that we are using for aliases and named sequences. */
    1069                 :            :     int offset;
    1070                 :            :     int i;
    1071                 :            :     int word;
    1072                 :            :     const unsigned char* w;
    1073                 :            : 
    1074         [ -  + ]:     112952 :     if (code >= 0x110000)
    1075                 :          0 :         return 0;
    1076                 :            : 
    1077                 :            :     /* XXX should we just skip all the code points in the PUAs here? */
    1078   [ +  +  +  +  :     112952 :     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
          +  +  +  +  +  
                      + ]
    1079                 :        931 :         return 0;
    1080                 :            : 
    1081   [ +  +  +  + ]:     112021 :     if (UCD_Check(self)) {
    1082                 :            :         /* in 3.2.0 there are no aliases and named sequences */
    1083                 :            :         const change_record *old;
    1084   [ +  +  +  +  :       1580 :         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
             +  +  +  - ]
    1085                 :        524 :             return 0;
    1086                 :       1056 :         old = get_old_record(self, code);
    1087         [ +  + ]:       1056 :         if (old->category_changed == 0) {
    1088                 :            :             /* unassigned */
    1089                 :        632 :             return 0;
    1090                 :            :         }
    1091                 :            :     }
    1092                 :            : 
    1093   [ +  +  +  + ]:     110865 :     if (SBase <= code && code < SBase+SCount) {
    1094                 :            :         /* Hangul syllable. */
    1095                 :      11172 :         int SIndex = code - SBase;
    1096                 :      11172 :         int L = SIndex / NCount;
    1097                 :      11172 :         int V = (SIndex % NCount) / TCount;
    1098                 :      11172 :         int T = SIndex % TCount;
    1099                 :            : 
    1100         [ -  + ]:      11172 :         if (buflen < 27)
    1101                 :            :             /* Worst case: HANGUL SYLLABLE <10chars>. */
    1102                 :          0 :             return 0;
    1103                 :      11172 :         strcpy(buffer, "HANGUL SYLLABLE ");
    1104                 :      11172 :         buffer += 16;
    1105                 :      11172 :         strcpy(buffer, hangul_syllables[L][0]);
    1106                 :      11172 :         buffer += strlen(hangul_syllables[L][0]);
    1107                 :      11172 :         strcpy(buffer, hangul_syllables[V][1]);
    1108                 :      11172 :         buffer += strlen(hangul_syllables[V][1]);
    1109                 :      11172 :         strcpy(buffer, hangul_syllables[T][2]);
    1110                 :      11172 :         buffer += strlen(hangul_syllables[T][2]);
    1111                 :      11172 :         *buffer = '\0';
    1112                 :      11172 :         return 1;
    1113                 :            :     }
    1114                 :            : 
    1115         [ +  + ]:      99693 :     if (is_unified_ideograph(code)) {
    1116         [ -  + ]:      27593 :         if (buflen < 28)
    1117                 :            :             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
    1118                 :          0 :             return 0;
    1119                 :      27593 :         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
    1120                 :      27593 :         return 1;
    1121                 :            :     }
    1122                 :            : 
    1123                 :            :     /* get offset into phrasebook */
    1124                 :      72100 :     offset = phrasebook_offset1[(code>>phrasebook_shift)];
    1125                 :      72100 :     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
    1126                 :      72100 :                                (code&((1<<phrasebook_shift)-1))];
    1127         [ +  + ]:      72100 :     if (!offset)
    1128                 :      13168 :         return 0;
    1129                 :            : 
    1130                 :      58932 :     i = 0;
    1131                 :            : 
    1132                 :            :     for (;;) {
    1133                 :            :         /* get word index */
    1134                 :     229772 :         word = phrasebook[offset] - phrasebook_short;
    1135         [ +  + ]:     229772 :         if (word >= 0) {
    1136                 :      80778 :             word = (word << 8) + phrasebook[offset+1];
    1137                 :      80778 :             offset += 2;
    1138                 :            :         } else
    1139                 :     148994 :             word = phrasebook[offset++];
    1140         [ +  + ]:     229772 :         if (i) {
    1141         [ -  + ]:     170840 :             if (i > buflen)
    1142                 :          0 :                 return 0; /* buffer overflow */
    1143                 :     170840 :             buffer[i++] = ' ';
    1144                 :            :         }
    1145                 :            :         /* copy word string from lexicon.  the last character in the
    1146                 :            :            word has bit 7 set.  the last word in a string ends with
    1147                 :            :            0x80 */
    1148                 :     229772 :         w = lexicon + lexicon_offset[word];
    1149         [ +  + ]:    1297321 :         while (*w < 128) {
    1150         [ -  + ]:    1067549 :             if (i >= buflen)
    1151                 :          0 :                 return 0; /* buffer overflow */
    1152                 :    1067549 :             buffer[i++] = *w++;
    1153                 :            :         }
    1154         [ -  + ]:     229772 :         if (i >= buflen)
    1155                 :          0 :             return 0; /* buffer overflow */
    1156                 :     229772 :         buffer[i++] = *w & 127;
    1157         [ +  + ]:     229772 :         if (*w == 128)
    1158                 :      58932 :             break; /* end of word */
    1159                 :            :     }
    1160                 :            : 
    1161                 :      58932 :     return 1;
    1162                 :            : }
    1163                 :            : 
    1164                 :            : static int
    1165                 :      20060 : capi_getucname(Py_UCS4 code,
    1166                 :            :                char* buffer, int buflen,
    1167                 :            :                int with_alias_and_seq)
    1168                 :            : {
    1169                 :      20060 :     return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
    1170                 :            : 
    1171                 :            : }
    1172                 :            : 
    1173                 :            : static int
    1174                 :      23215 : _cmpname(PyObject *self, int code, const char* name, int namelen)
    1175                 :            : {
    1176                 :            :     /* check if code corresponds to the given name */
    1177                 :            :     int i;
    1178                 :            :     char buffer[NAME_MAXLEN+1];
    1179         [ +  + ]:      23215 :     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
    1180                 :       1156 :         return 0;
    1181         [ +  + ]:     483111 :     for (i = 0; i < namelen; i++) {
    1182         [ +  + ]:     465185 :         if (Py_TOUPPER(name[i]) != buffer[i])
    1183                 :       4133 :             return 0;
    1184                 :            :     }
    1185                 :      17926 :     return buffer[namelen] == '\0';
    1186                 :            : }
    1187                 :            : 
    1188                 :            : static void
    1189                 :      33555 : find_syllable(const char *str, int *len, int *pos, int count, int column)
    1190                 :            : {
    1191                 :            :     int i, len1;
    1192                 :      33555 :     *len = -1;
    1193         [ +  + ]:     794135 :     for (i = 0; i < count; i++) {
    1194                 :     760580 :         const char *s = hangul_syllables[i][column];
    1195                 :     760580 :         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
    1196         [ +  + ]:     760580 :         if (len1 <= *len)
    1197                 :     282823 :             continue;
    1198         [ +  + ]:     477757 :         if (strncmp(str, s, len1) == 0) {
    1199                 :      59655 :             *len = len1;
    1200                 :      59655 :             *pos = i;
    1201                 :            :         }
    1202                 :            :     }
    1203         [ -  + ]:      33555 :     if (*len == -1) {
    1204                 :          0 :         *len = 0;
    1205                 :            :     }
    1206                 :      33555 : }
    1207                 :            : 
    1208                 :            : static int
    1209                 :      17926 : _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
    1210                 :            : {
    1211                 :            :     /* check if named sequences are allowed */
    1212   [ +  +  +  +  :      17926 :     if (!with_named_seq && IS_NAMED_SEQ(cp))
                   +  - ]
    1213                 :        466 :         return 0;
    1214                 :            :     /* if the code point is in the PUA range that we use for aliases,
    1215                 :            :      * convert it to obtain the right code point */
    1216   [ +  +  +  + ]:      17460 :     if (IS_ALIAS(cp))
    1217                 :         22 :         *code = name_aliases[cp-aliases_start];
    1218                 :            :     else
    1219                 :      17438 :         *code = cp;
    1220                 :      17460 :     return 1;
    1221                 :            : }
    1222                 :            : 
    1223                 :            : static int
    1224                 :      57190 : _getcode(PyObject* self,
    1225                 :            :          const char* name, int namelen, Py_UCS4* code, int with_named_seq)
    1226                 :            : {
    1227                 :            :     /* Return the code point associated with the given name.
    1228                 :            :      * Named aliases are resolved too (unless self != NULL (i.e. we are using
    1229                 :            :      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
    1230                 :            :      * using for the named sequence, and the caller must then convert it. */
    1231                 :            :     unsigned int h, v;
    1232                 :      57190 :     unsigned int mask = code_size-1;
    1233                 :            :     unsigned int i, incr;
    1234                 :            : 
    1235                 :            :     /* Check for hangul syllables. */
    1236         [ +  + ]:      57190 :     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
    1237                 :      11185 :         int len, L = -1, V = -1, T = -1;
    1238                 :      11185 :         const char *pos = name + 16;
    1239                 :      11185 :         find_syllable(pos, &len, &L, LCount, 0);
    1240                 :      11185 :         pos += len;
    1241                 :      11185 :         find_syllable(pos, &len, &V, VCount, 1);
    1242                 :      11185 :         pos += len;
    1243                 :      11185 :         find_syllable(pos, &len, &T, TCount, 2);
    1244                 :      11185 :         pos += len;
    1245   [ +  -  +  -  :      11185 :         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
             +  -  +  - ]
    1246                 :      11185 :             *code = SBase + (L*VCount+V)*TCount + T;
    1247                 :      11185 :             return 1;
    1248                 :            :         }
    1249                 :            :         /* Otherwise, it's an illegal syllable name. */
    1250                 :          0 :         return 0;
    1251                 :            :     }
    1252                 :            : 
    1253                 :            :     /* Check for unified ideographs. */
    1254         [ +  + ]:      46005 :     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
    1255                 :            :         /* Four or five hexdigits must follow. */
    1256                 :      27595 :         v = 0;
    1257                 :      27595 :         name += 22;
    1258                 :      27595 :         namelen -= 22;
    1259   [ +  +  -  + ]:      27595 :         if (namelen != 4 && namelen != 5)
    1260                 :          0 :             return 0;
    1261         [ +  + ]:     137982 :         while (namelen--) {
    1262                 :     110387 :             v *= 16;
    1263   [ +  -  +  + ]:     110387 :             if (*name >= '0' && *name <= '9')
    1264                 :      79037 :                 v += *name - '0';
    1265   [ +  -  +  - ]:      31350 :             else if (*name >= 'A' && *name <= 'F')
    1266                 :      31350 :                 v += *name - 'A' + 10;
    1267                 :            :             else
    1268                 :          0 :                 return 0;
    1269                 :     110387 :             name++;
    1270                 :            :         }
    1271         [ -  + ]:      27595 :         if (!is_unified_ideograph(v))
    1272                 :          0 :             return 0;
    1273                 :      27595 :         *code = v;
    1274                 :      27595 :         return 1;
    1275                 :            :     }
    1276                 :            : 
    1277                 :            :     /* the following is the same as python's dictionary lookup, with
    1278                 :            :        only minor changes.  see the makeunicodedata script for more
    1279                 :            :        details */
    1280                 :            : 
    1281                 :      18410 :     h = (unsigned int) _gethash(name, namelen, code_magic);
    1282                 :      18410 :     i = (~h) & mask;
    1283                 :      18410 :     v = code_hash[i];
    1284         [ +  + ]:      18410 :     if (!v)
    1285                 :          2 :         return 0;
    1286         [ +  + ]:      18408 :     if (_cmpname(self, v, name, namelen)) {
    1287                 :      15274 :         return _check_alias_and_seq(v, code, with_named_seq);
    1288                 :            :     }
    1289                 :       3134 :     incr = (h ^ (h >> 3)) & mask;
    1290         [ -  + ]:       3134 :     if (!incr)
    1291                 :          0 :         incr = mask;
    1292                 :            :     for (;;) {
    1293                 :       5289 :         i = (i + incr) & mask;
    1294                 :       5289 :         v = code_hash[i];
    1295         [ +  + ]:       5289 :         if (!v)
    1296                 :        482 :             return 0;
    1297         [ +  + ]:       4807 :         if (_cmpname(self, v, name, namelen)) {
    1298                 :       2652 :             return _check_alias_and_seq(v, code, with_named_seq);
    1299                 :            :         }
    1300                 :       2155 :         incr = incr << 1;
    1301         [ +  + ]:       2155 :         if (incr > mask)
    1302                 :       1139 :             incr = incr ^ code_poly;
    1303                 :            :     }
    1304                 :            : }
    1305                 :            : 
    1306                 :            : static int
    1307                 :        620 : capi_getcode(const char* name, int namelen, Py_UCS4* code,
    1308                 :            :              int with_named_seq)
    1309                 :            : {
    1310                 :        620 :     return _getcode(NULL, name, namelen, code, with_named_seq);
    1311                 :            : 
    1312                 :            : }
    1313                 :            : 
    1314                 :            : static void
    1315                 :         81 : unicodedata_destroy_capi(PyObject *capsule)
    1316                 :            : {
    1317                 :         81 :     void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
    1318                 :         81 :     PyMem_Free(capi);
    1319                 :         81 : }
    1320                 :            : 
    1321                 :            : static PyObject *
    1322                 :         81 : unicodedata_create_capi(void)
    1323                 :            : {
    1324                 :         81 :     _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
    1325         [ -  + ]:         81 :     if (capi == NULL) {
    1326                 :            :         PyErr_NoMemory();
    1327                 :          0 :         return NULL;
    1328                 :            :     }
    1329                 :         81 :     capi->getname = capi_getucname;
    1330                 :         81 :     capi->getcode = capi_getcode;
    1331                 :            : 
    1332                 :         81 :     PyObject *capsule = PyCapsule_New(capi,
    1333                 :            :                                       PyUnicodeData_CAPSULE_NAME,
    1334                 :            :                                       unicodedata_destroy_capi);
    1335         [ -  + ]:         81 :     if (capsule == NULL) {
    1336                 :          0 :         PyMem_Free(capi);
    1337                 :            :     }
    1338                 :         81 :     return capsule;
    1339                 :            : };
    1340                 :            : 
    1341                 :            : 
    1342                 :            : /* -------------------------------------------------------------------- */
    1343                 :            : /* Python bindings */
    1344                 :            : 
    1345                 :            : /*[clinic input]
    1346                 :            : unicodedata.UCD.name
    1347                 :            : 
    1348                 :            :     self: self
    1349                 :            :     chr: int(accept={str})
    1350                 :            :     default: object=NULL
    1351                 :            :     /
    1352                 :            : 
    1353                 :            : Returns the name assigned to the character chr as a string.
    1354                 :            : 
    1355                 :            : If no name is defined, default is returned, or, if not given,
    1356                 :            : ValueError is raised.
    1357                 :            : [clinic start generated code]*/
    1358                 :            : 
    1359                 :            : static PyObject *
    1360                 :      69677 : unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
    1361                 :            : /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
    1362                 :            : {   /* Return the name of code point chr as a new str, or fall back to default_value. */
    1363                 :            :     char name[NAME_MAXLEN+1];   /* one byte larger than the limit passed to _getucname() */
    1364                 :      69677 :     Py_UCS4 c = (Py_UCS4)chr;
    1365                 :            :     /* A false result from _getucname() is treated as "no name for c". */
    1366         [ +  + ]:      69677 :     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
    1367         [ +  + ]:      14065 :         if (default_value == NULL) {   /* no default was supplied by the caller */
    1368                 :       4096 :             PyErr_SetString(PyExc_ValueError, "no such name");
    1369                 :       4096 :             return NULL;
    1370                 :            :         }
    1371                 :            :         else {
    1372                 :       9969 :             Py_INCREF(default_value);   /* the return value must be a new reference */
    1373                 :       9969 :             return default_value;
    1374                 :            :         }
    1375                 :            :     }
    1376                 :            :     /* PyUnicode_FromString() reads name as a NUL-terminated C string. */
    1377                 :      55612 :     return PyUnicode_FromString(name);
    1378                 :            : }
    1379                 :            : 
    1380                 :            : /*[clinic input]
    1381                 :            : unicodedata.UCD.lookup
    1382                 :            : 
    1383                 :            :     self: self
    1384                 :            :     name: str(accept={str, robuffer}, zeroes=True)
    1385                 :            :     /
    1386                 :            : 
    1387                 :            : Look up character by name.
    1388                 :            : 
    1389                 :            : If a character with the given name is found, return the
    1390                 :            : corresponding character.  If not found, KeyError is raised.
    1391                 :            : [clinic start generated code]*/
    1392                 :            : 
    1393                 :            : static PyObject *
    1394                 :      56570 : unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
    1395                 :            :                             Py_ssize_t name_length)
    1396                 :            : /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
    1397                 :            : {   /* Resolve a character name to a new str: one code point or a named sequence. */
    1398                 :            :     Py_UCS4 code;
    1399                 :            :     unsigned int index;
    1400         [ -  + ]:      56570 :     if (name_length > NAME_MAXLEN) {   /* length limit enforced before calling _getcode() */
    1401                 :          0 :         PyErr_SetString(PyExc_KeyError, "name too long");
    1402                 :          0 :         return NULL;
    1403                 :            :     }
    1404                 :            :     /* A false result from _getcode() is reported to the caller as KeyError. */
    1405         [ +  + ]:      56570 :     if (!_getcode(self, name, (int)name_length, &code, 1)) {
    1406                 :        481 :         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
    1407                 :        481 :         return NULL;
    1408                 :            :     }
    1409                 :            :     /* Check whether code is in the PUA range that we use for named sequences
    1410                 :            :        and, if so, return the stored sequence instead of a single character. */
    1411   [ +  +  +  - ]:      56089 :     if (IS_NAMED_SEQ(code)) {
    1412                 :        468 :         index = code-named_sequences_start;   /* position within named_sequences[] */
    1413                 :        468 :         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
    1414                 :        468 :                                          named_sequences[index].seq,
    1415                 :        468 :                                          named_sequences[index].seqlen);
    1416                 :            :     }
    1417                 :      55621 :     return PyUnicode_FromOrdinal(code);   /* ordinary case: a one-character str */
    1418                 :            : }
    1419                 :            : 
    1420                 :            : // List of functions used to define module functions *AND* unicodedata.UCD
    1421                 :            : // methods. For module functions, self is the module. For UCD methods, self
    1422                 :            : // is a UCD instance. The UCD_Check() macro is used to check if self is
    1423                 :            : // a UCD instance.
    1424                 :            : static PyMethodDef unicodedata_functions[] = {
    1425                 :            :     UNICODEDATA_UCD_DECIMAL_METHODDEF
    1426                 :            :     UNICODEDATA_UCD_DIGIT_METHODDEF
    1427                 :            :     UNICODEDATA_UCD_NUMERIC_METHODDEF
    1428                 :            :     UNICODEDATA_UCD_CATEGORY_METHODDEF
    1429                 :            :     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    1430                 :            :     UNICODEDATA_UCD_COMBINING_METHODDEF
    1431                 :            :     UNICODEDATA_UCD_MIRRORED_METHODDEF
    1432                 :            :     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    1433                 :            :     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    1434                 :            :     UNICODEDATA_UCD_NAME_METHODDEF
    1435                 :            :     UNICODEDATA_UCD_LOOKUP_METHODDEF
    1436                 :            :     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    1437                 :            :     UNICODEDATA_UCD_NORMALIZE_METHODDEF
    1438                 :            :     {NULL, NULL}                /* sentinel: terminates the method table */
    1439                 :            : };
    1440                 :            : 
    1441                 :            : static int
    1442                 :       9538 : ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
    1443                 :            : {   /* Py_tp_traverse slot of the UCD type: the only object visited is the instance's type. */
    1444   [ +  -  -  + ]:       9538 :     Py_VISIT(Py_TYPE(self));   /* Py_VISIT returns early if visit() reports non-zero */
    1445                 :       9538 :     return 0;
    1446                 :            : }
    1447                 :            : 
    1448                 :            : static void
    1449                 :         81 : ucd_dealloc(PreviousDBVersion *self)
    1450                 :            : {   /* Py_tp_dealloc slot: untrack and free the instance, then release its type. */
    1451                 :         81 :     PyTypeObject *tp = Py_TYPE(self);   /* fetched first: self is freed two lines below */
    1452                 :         81 :     PyObject_GC_UnTrack(self);
    1453                 :         81 :     PyObject_GC_Del(self);
    1454                 :         81 :     Py_DECREF(tp);   /* the type comes from PyType_FromSpec(); instances hold a reference to it */
    1455                 :         81 : }
    1456                 :            : 
    1457                 :            : static PyType_Slot ucd_type_slots[] = {   /* slot table referenced by ucd_type_spec */
    1458                 :            :     {Py_tp_dealloc, ucd_dealloc},
    1459                 :            :     {Py_tp_traverse, ucd_traverse},
    1460                 :            :     {Py_tp_getattro, PyObject_GenericGetAttr},
    1461                 :            :     {Py_tp_methods, unicodedata_functions},   /* same table as the module-level functions */
    1462                 :            :     {Py_tp_members, DB_members},
    1463                 :            :     {0, 0}   /* sentinel */
    1464                 :            : };
    1465                 :            : 
    1466                 :            : static PyType_Spec ucd_type_spec = {   /* passed to PyType_FromSpec() in unicodedata_exec() */
    1467                 :            :     .name = "unicodedata.UCD",
    1468                 :            :     .basicsize = sizeof(PreviousDBVersion),
    1469                 :            :     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
    1470                 :            :               Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),   /* HAVE_GC pairs with ucd_traverse */
    1471                 :            :     .slots = ucd_type_slots
    1472                 :            : };
    1473                 :            : 
    1474                 :            : PyDoc_STRVAR(unicodedata_docstring,   /* module docstring, installed as .m_doc of unicodedata_module */
    1475                 :            : "This module provides access to the Unicode Character Database which\n\
    1476                 :            : defines character properties for all Unicode characters. The data in\n\
    1477                 :            : this database is based on the UnicodeData.txt file version\n\
    1478                 :            : " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
    1479                 :            : \n\
    1480                 :            : The module uses the same names and symbols as defined by the\n\
    1481                 :            : UnicodeData File Format " UNIDATA_VERSION ".");
    1482                 :            : 
    1483                 :            : static int
    1484                 :         81 : unicodedata_exec(PyObject *module)
    1485                 :            : {   /* Py_mod_exec slot: fill in the module; returns 0 on success, -1 on failure. */
    1486         [ -  + ]:         81 :     if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
    1487                 :          0 :         return -1;
    1488                 :            :     }
    1489                 :            :     /* Create the UCD type from its spec and add it to the module. */
    1490                 :         81 :     PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
    1491         [ -  + ]:         81 :     if (ucd_type == NULL) {
    1492                 :          0 :         return -1;
    1493                 :            :     }
    1494                 :            : 
    1495         [ -  + ]:         81 :     if (PyModule_AddType(module, ucd_type) < 0) {
    1496                 :          0 :         Py_DECREF(ucd_type);
    1497                 :          0 :         return -1;
    1498                 :            :     }
    1499                 :            : 
    1500                 :            :     // Unicode database version 3.2.0 used by the IDNA encoding
    1501                 :            :     PyObject *v;
    1502                 :         81 :     v = new_previous_version(ucd_type, "3.2.0",
    1503                 :            :                              get_change_3_2_0, normalization_3_2_0);
    1504                 :         81 :     Py_DECREF(ucd_type);   /* local reference released whether or not v was created */
    1505         [ -  + ]:         81 :     if (v == NULL) {
    1506                 :          0 :         return -1;
    1507                 :            :     }
    1508         [ -  + ]:         81 :     if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
    1509                 :          0 :         Py_DECREF(v);   /* PyModule_AddObject() takes over the reference only on success */
    1510                 :          0 :         return -1;
    1511                 :            :     }
    1512                 :            : 
    1513                 :            :     /* Export the C API capsule as the module attribute _ucnhash_CAPI */
    1514                 :         81 :     PyObject *capsule = unicodedata_create_capi();
    1515         [ -  + ]:         81 :     if (capsule == NULL) {
    1516                 :          0 :         return -1;
    1517                 :            :     }
    1518                 :         81 :     int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
    1519                 :         81 :     Py_DECREF(capsule);   /* PyModule_AddObjectRef() never steals, so release in both outcomes */
    1520         [ -  + ]:         81 :     if (rc < 0) {
    1521                 :          0 :         return -1;
    1522                 :            :     }
    1523                 :         81 :     return 0;
    1524                 :            : }
    1525                 :            : 
    1526                 :            : static PyModuleDef_Slot unicodedata_slots[] = {   /* module slots referenced by .m_slots */
    1527                 :            :     {Py_mod_exec, unicodedata_exec},
    1528                 :            :     {0, NULL}   /* sentinel */
    1529                 :            : };
    1530                 :            : 
    1531                 :            : static struct PyModuleDef unicodedata_module = {   /* definition returned by PyInit_unicodedata() */
    1532                 :            :     PyModuleDef_HEAD_INIT,
    1533                 :            :     .m_name = "unicodedata",
    1534                 :            :     .m_doc = unicodedata_docstring,
    1535                 :            :     .m_size = 0,   /* no per-module state is requested */
    1536                 :            :     .m_methods = unicodedata_functions,   /* table shared with the UCD type's methods */
    1537                 :            :     .m_slots = unicodedata_slots,
    1538                 :            : };
    1539                 :            : 
    1540                 :            : PyMODINIT_FUNC
    1541                 :         81 : PyInit_unicodedata(void)
    1542                 :            : {   /* Module entry point: hands the definition to PyModuleDef_Init(); unicodedata_exec() is its Py_mod_exec slot. */
    1543                 :         81 :     return PyModuleDef_Init(&unicodedata_module);
    1544                 :            : }
    1545                 :            : 
    1546                 :            : 
    1547                 :            : /*
    1548                 :            : Local variables:
    1549                 :            : c-basic-offset: 4
    1550                 :            : indent-tabs-mode: nil
    1551                 :            : End:
    1552                 :            : */

Generated by: LCOV version 1.14