Создание собственной универсальной функции#

У меня есть Сила!

— He-Man

Создание новой универсальной функции#

Перед чтением этого может помочь ознакомиться с основами C-расширений для Python, прочитав/просмотрев учебники в разделе 1 Расширение и встраивание интерпретатора Python и в Как расширить NumPy

Модуль umath — это сгенерированный компьютером C-модуль, создающий множество универсальных функций. Он предоставляет множество примеров создания универсальной функции. Создание собственной универсальной функции, которая будет использовать механизм универсальных функций, также не является сложным. Предположим, у вас есть функция, которую вы хотите применить поэлементно к её входным данным. Создав новую универсальную функцию, вы получите функцию, которая обрабатывает

вещание
N-мерное циклическое выполнение
автоматическое преобразование типов с минимальным использованием памяти
необязательные выходные массивы

Создать собственную универсальную функцию (ufunc) несложно. Для этого требуется одномерный цикл для каждого типа данных, который вы хотите поддерживать. Каждый одномерный цикл должен иметь определённую сигнатуру, и можно использовать только ufunc для типов данных фиксированного размера. Ниже приведён вызов функции для создания новой ufunc для работы со встроенными типами данных. Для регистрации ufunc для пользовательских типов данных используется другой механизм.

В следующих нескольких разделах мы приводим примеры кода, которые можно легко модифицировать для создания собственных универсальных функций. Примеры представляют собой последовательно более полные или сложные версии функции логита, часто используемой в статистическом моделировании. Логит также интересен тем, что благодаря магии стандартов IEEE (в частности, IEEE 754), все созданные ниже функции логита автоматически обладают следующим поведением.

>>> logit(0)
-inf
>>> logit(1)
inf
>>> logit(2)
nan
>>> logit(-2)
nan

Это замечательно, потому что автору функции не нужно вручную распространять inf или nan.

Пример расширения без ufunc#

Для сравнения и общего ознакомления читателя мы предоставляем простую реализацию C-расширения для logit который не использует numpy.

Для этого нам нужны три файла. Первый — это C-файл, содержащий фактический код, а другие — два файла проекта, описывающих как создать модуль.

#define PY_SSIZE_T_CLEAN
#include 
#include 

/*
 * spammodule.c
 * This is the C code for a non-numpy Python extension to
 * define the logit function, where logit(p) = log(p/(1-p)).
 * This function will not work on numpy arrays automatically.
 * numpy.vectorize must be called in python to generate
 * a numpy-friendly function.
 *
 * Details explaining the Python-C API can be found under
 * 'Extending and Embedding' and 'Python/C API' at
 * docs.python.org .
 */


/* This declares the logit function */
static PyObject *spam_logit(PyObject *self, PyObject *args);

/*
 * This tells Python what methods this module has.
 * See the Python-C API for more information.
 */
static PyMethodDef SpamMethods[] = {
    {"logit",
        spam_logit,
        METH_VARARGS, "compute logit"},
    {NULL, NULL, 0, NULL}
};

/*
 * This actually defines the logit function for
 * input args from Python.
 */

static PyObject *spam_logit(PyObject *self, PyObject *args)
{
    double p;

    /* This parses the Python argument into a double */
    if(!PyArg_ParseTuple(args, "d", &p)) {
        return NULL;
    }

    /* THE ACTUAL LOGIT FUNCTION */
    p = p/(1-p);
    p = log(p);

    /*This builds the answer back into a python object */
    return Py_BuildValue("d", p);
}

/* This initiates the module using the above definitions. */
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "spam",
    NULL,
    -1,
    SpamMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC PyInit_spam(void)
{
    PyObject *m;
    m = PyModule_Create(&moduledef);
    if (!m) {
        return NULL;
    }
    return m;
}

Чтобы создать модуль, действуют так же, как и для пакета Python, создавая pyproject.toml файл, который определяет бэкенд сборки, а затем другой файл для этого бэкенда, описывающий, как компилировать код. Для бэкенда мы рекомендуем meson-python, так как мы используем его для самого numpy, но ниже мы также показываем, как использовать более старый setuptools.

meson

Пример pyproject.toml и meson.build.

[project]
name = "spam"
version = "0.1"

[build-system]
requires = ["meson-python"]
build-backend = "mesonpy"

project('spam', 'c')

py = import('python').find_installation()

sources = files('spammodule.c')

extension_module = py.extension_module(
  'spam',
  sources,
  install: true,
)

setuptools

Пример pyproject.toml и setup.py.

[project]
name = "spam"
version = "0.1"

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

from setuptools import setup, Extension

spammodule = Extension('spam', sources=['spammodule.c'])

setup(name='spam', version='1.0',
      ext_modules=[spammodule])

С любым из вышеперечисленного можно собрать и установить spam пакет с,

pip install .

Как только spam модуль импортирован в python, вы можете вызвать logit через spam.logit. Обратите внимание, что функция, использованная выше, не может быть применена как есть к массивам numpy. Для этого мы должны вызвать numpy.vectorize на него. Например:

>>> import numpy as np
>>> import spam
>>> spam.logit(0)
-inf
>>> spam.logit(1)
inf
>>> spam.logit(0.5)
0.0
>>> x = np.linspace(0,1,10)
>>> spam.logit(x)
TypeError: only length-1 arrays can be converted to Python scalars
>>> f = np.vectorize(spam.logit)
>>> f(x)
array([       -inf, -2.07944154, -1.25276297, -0.69314718, -0.22314355,
    0.22314355,  0.69314718,  1.25276297,  2.07944154,         inf])

ПОЛУЧЕННАЯ ЛОГИТ-ФУНКЦИЯ НЕ БЫСТРАЯ! numpy.vectorize просто перебирает spam.logit. Цикл выполняется на уровне C, но массив numpy постоянно анализируется и перестраивается. Это дорого. Когда автор сравнил numpy.vectorize(spam.logit) по сравнению с логит-универсальными функциями, построенными ниже, логит-универсальные функции были почти в 4 раза быстрее. Большее или меньшее ускорение возможно, конечно, в зависимости от характера функции.

Пример NumPy ufunc для одного типа данных#

Для простоты мы даём ufunc для одного dtype, 'f8' double. Как и в предыдущем разделе, мы сначала приводим .c файл и затем файлы, используемые для создания npufunc модуль, содержащий ufunc.

Место в коде, соответствующее фактическим вычислениям для универсальной функции, отмечено /* BEGIN main ufunc computation */ и /* END main ufunc computation */. Код между этими строками является основным элементом, который необходимо изменить для создания собственной универсальной функции.

#define PY_SSIZE_T_CLEAN
#include 
#include "numpy/ndarraytypes.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_3kcompat.h"
#include 

/*
 * single_type_logit.c
 * This is the C code for creating your own
 * NumPy ufunc for a logit function.
 *
 * In this code we only define the ufunc for
 * a single dtype. The computations that must
 * be replaced to create a ufunc for
 * a different function are marked with BEGIN
 * and END.
 *
 * Details explaining the Python-C API can be found under
 * 'Extending and Embedding' and 'Python/C API' at
 * docs.python.org .
 */

static PyMethodDef LogitMethods[] = {
    {NULL, NULL, 0, NULL}
};

/* The loop definition must precede the PyMODINIT_FUNC. */

static void double_logit(char **args, const npy_intp *dimensions,
                         const npy_intp *steps, void *data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in = args[0], *out = args[1];
    npy_intp in_step = steps[0], out_step = steps[1];

    double tmp;

    for (i = 0; i < n; i++) {
        /* BEGIN main ufunc computation */
        tmp = *(double *)in;
        tmp /= 1 - tmp;
        *((double *)out) = log(tmp);
        /* END main ufunc computation */

        in += in_step;
        out += out_step;
    }
}

/* This a pointer to the above function */
PyUFuncGenericFunction funcs[1] = {&double_logit};

/* These are the input and return dtypes of logit.*/
static const char types[2] = {NPY_DOUBLE, NPY_DOUBLE};

static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "npufunc",
    NULL,
    -1,
    LogitMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC PyInit_npufunc(void)
{
    PyObject *m, *logit, *d;

    import_array();
    import_umath();

    m = PyModule_Create(&moduledef);
    if (!m) {
        return NULL;
    }

    logit = PyUFunc_FromFuncAndData(funcs, NULL, types, 1, 1, 1,
                                    PyUFunc_None, "logit",
                                    "logit_docstring", 0);

    d = PyModule_GetDict(m);

    PyDict_SetItemString(d, "logit", logit);
    Py_DECREF(logit);

    return m;
}

Для файлов, необходимых для создания модуля, основное отличие от нашего предыдущего примера заключается в том, что теперь нам нужно объявить зависимости от numpy.

meson

Пример pyproject.toml и meson.build.

[project]
name = "npufunc"
dependencies = ["numpy"]
version = "0.1"

[build-system]
requires = ["meson-python", "numpy"]
build-backend = "mesonpy"

project('npufunc', 'c')

py = import('python').find_installation()
np_dep = dependency('numpy')

sources = files('single_type_logit.c')

extension_module = py.extension_module(
  'npufunc',
  sources,
  dependencies: [np_dep],
  install: true,
)

setuptools

Пример pyproject.toml и setup.py.

[project]
name = "npufunc"
dependencies = ["numpy"]
version = "0.1"

[build-system]
requires = ["setuptools", "numpy"]
build-backend = "setuptools.build_meta"

from setuptools import setup, Extension
from numpy import get_include

npufunc = Extension('npufunc',
                    sources=['single_type_logit.c'],
                    include_dirs=[get_include()])

setup(name='npufunc', version='1.0', ext_modules=[npufunc])

После установки вышеуказанного, его можно импортировать и использовать следующим образом:

>>> import numpy as np
>>> import npufunc
>>> npufunc.logit(0.5)
np.float64(0.0)
>>> a = np.linspace(0, 1, 5)
>>> npufunc.logit(a)
array([       -inf, -1.09861229,  0.        ,  1.09861229,         inf])

Пример NumPy ufunc с несколькими типами данных#

Теперь расширим вышесказанное до полного logit универсальная функция с внутренними циклами для чисел с плавающей точкой, двойной точности и длинных двойных. Здесь мы можем использовать те же файлы сборки, что и выше, за исключением необходимости изменить исходный файл с single_type_logit.c to multi_type_logit.c.

Места в коде, соответствующие фактическим вычислениям для универсальной функции, отмечены /* BEGIN main ufunc computation */ и /* END main ufunc computation */Код между этими строками — это основное, что нужно изменить для создания собственной универсальной функции (ufunc).

#define PY_SSIZE_T_CLEAN
#include 
#include "numpy/ndarraytypes.h"
#include "numpy/ufuncobject.h"
#include 

/*
 * multi_type_logit.c
 * This is the C code for creating your own
 * NumPy ufunc for a logit function.
 *
 * Each function of the form type_logit defines the
 * logit function for a different numpy dtype. Each
 * of these functions must be modified when you
 * create your own ufunc. The computations that must
 * be replaced to create a ufunc for
 * a different function are marked with BEGIN
 * and END.
 *
 * Details explaining the Python-C API can be found under
 * 'Extending and Embedding' and 'Python/C API' at
 * docs.python.org .
 *
 */

static PyMethodDef LogitMethods[] = {
    {NULL, NULL, 0, NULL}
};

/* The loop definitions must precede the PyMODINIT_FUNC. */

static void long_double_logit(char **args, const npy_intp *dimensions,
                              const npy_intp *steps, void *data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in = args[0], *out = args[1];
    npy_intp in_step = steps[0], out_step = steps[1];

    long double tmp;

    for (i = 0; i < n; i++) {
        /* BEGIN main ufunc computation */
        tmp = *(long double *)in;
        tmp /= 1 - tmp;
        *((long double *)out) = logl(tmp);
        /* END main ufunc computation */

        in += in_step;
        out += out_step;
    }
}

static void double_logit(char **args, const npy_intp *dimensions,
                         const npy_intp *steps, void *data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in = args[0], *out = args[1];
    npy_intp in_step = steps[0], out_step = steps[1];

    double tmp;

    for (i = 0; i < n; i++) {
        /* BEGIN main ufunc computation */
        tmp = *(double *)in;
        tmp /= 1 - tmp;
        *((double *)out) = log(tmp);
        /* END main ufunc computation */

        in += in_step;
        out += out_step;
    }
}

static void float_logit(char **args, const npy_intp *dimensions,
                       const npy_intp *steps, void *data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in = args[0], *out = args[1];
    npy_intp in_step = steps[0], out_step = steps[1];

    float tmp;

    for (i = 0; i < n; i++) {
        /* BEGIN main ufunc computation */
        tmp = *(float *)in;
        tmp /= 1 - tmp;
        *((float *)out) = logf(tmp);
        /* END main ufunc computation */

        in += in_step;
        out += out_step;
    }
}



/*This gives pointers to the above functions*/
PyUFuncGenericFunction funcs[3] = {&float_logit,
                                   &double_logit,
                                   &long_double_logit};

static const char types[6] = {NPY_FLOAT, NPY_FLOAT,
                              NPY_DOUBLE, NPY_DOUBLE,
                              NPY_LONGDOUBLE, NPY_LONGDOUBLE};

static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "npufunc",
    NULL,
    -1,
    LogitMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC PyInit_npufunc(void)
{
    PyObject *m, *logit, *d;

    import_array();
    import_umath();

    m = PyModule_Create(&moduledef);
    if (!m) {
        return NULL;
    }

    logit = PyUFunc_FromFuncAndData(funcs, NULL, types, 4, 1, 1,
                                    PyUFunc_None, "logit",
                                    "logit_docstring", 0);

    d = PyModule_GetDict(m);

    PyDict_SetItemString(d, "logit", logit);
    Py_DECREF(logit);

    return m;
}

После установки вышеуказанного, это можно импортировать и использовать следующим образом.

>>> import numpy as np
>>> import npufunc
>>> npufunc.logit(0.5)
np.float64(0.0)
>>> a = np.linspace(0, 1, 5, dtype="f4")
>>> npufunc.logit(a)
:1: RuntimeWarning: divide by zero encountered in logit
array([      -inf, -1.0986123,  0.       ,  1.0986123,        inf],
      dtype=float32)

Примечание

Поддержка float16 (половинная точность) в пользовательских ufunc более сложна из-за его нестандартного C-представления и требований к преобразованию. Приведённый выше код может обрабатывать float16 вход, но сделает это, преобразовав его в float32. Результат будет float32 тоже, но можно преобразовать его обратно в float16 передавая подходящий выходной массив, как в npufunc.logit(a, out=np.empty_like(a)). Примеры реальных float16 циклы, см. исходный код numpy.

Пример NumPy ufunc с несколькими аргументами/возвращаемыми значениями#

Создание ufunc с несколькими аргументами не сложно. Здесь мы делаем модификацию кода для ufunc logit, где мы вычисляем (A * B, logit(A * B)). Для простоты мы создаём цикл только для double.

Мы снова приводим только код на C, так как файлы, необходимые для создания модуля, такие же, как и раньше, но с заменой имени исходного файла на multi_arg_logit.c.

C-файл приведён ниже. Сгенерированная ufunc принимает два аргумента A и B. Он возвращает кортеж, первый элемент которого A * B и чей второй элемент — logit(A * B). Обратите внимание, что он автоматически поддерживает broadcasting, а также все другие свойства универсальной функции.

#define PY_SSIZE_T_CLEAN
#include 
#include "numpy/ndarraytypes.h"
#include "numpy/ufuncobject.h"
#include 

/*
 * multi_arg_logit.c
 * This is the C code for creating your own
 * NumPy ufunc for a multiple argument, multiple
 * return value ufunc. The places where the
 * ufunc computation is carried out are marked
 * with comments.
 *
 * Details explaining the Python-C API can be found under
 * 'Extending and Embedding' and 'Python/C API' at
 * docs.python.org.
 */

static PyMethodDef LogitMethods[] = {
    {NULL, NULL, 0, NULL}
};

/* The loop definition must precede the PyMODINIT_FUNC. */

static void double_logitprod(char **args, const npy_intp *dimensions,
                             const npy_intp *steps, void *data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in1 = args[0], *in2 = args[1];
    char *out1 = args[2], *out2 = args[3];
    npy_intp in1_step = steps[0], in2_step = steps[1];
    npy_intp out1_step = steps[2], out2_step = steps[3];

    double tmp;

    for (i = 0; i < n; i++) {
        /* BEGIN main ufunc computation */
        tmp = *(double *)in1;
        tmp *= *(double *)in2;
        *((double *)out1) = tmp;
        *((double *)out2) = log(tmp / (1 - tmp));
        /* END main ufunc computation */

        in1 += in1_step;
        in2 += in2_step;
        out1 += out1_step;
        out2 += out2_step;
    }
}

/*This a pointer to the above function*/
PyUFuncGenericFunction funcs[1] = {&double_logitprod};

/* These are the input and return dtypes of logit.*/

static const char types[4] = {NPY_DOUBLE, NPY_DOUBLE,
                              NPY_DOUBLE, NPY_DOUBLE};

static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "npufunc",
    NULL,
    -1,
    LogitMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC PyInit_npufunc(void)
{
    PyObject *m, *logit, *d;

    import_array();
    import_umath();

    m = PyModule_Create(&moduledef);
    if (!m) {
        return NULL;
    }

    logit = PyUFunc_FromFuncAndData(funcs, NULL, types, 1, 2, 2,
                                    PyUFunc_None, "logit",
                                    "logit_docstring", 0);

    d = PyModule_GetDict(m);

    PyDict_SetItemString(d, "logit", logit);
    Py_DECREF(logit);

    return m;
}

Пример NumPy ufunc с аргументами структурированного типа массива#

Этот пример показывает, как создать ufunc для структурированного типа данных массива. В примере показан тривиальный ufunc для сложения двух массивов с типом данных 'u8,u8,u8'. Процесс немного отличается от других примеров, поскольку вызов PyUFunc_FromFuncAndData не может регистрировать ufuncs для пользовательских типов данных и типов структурированных массивов. Нам также нужно вызвать PyUFunc_RegisterLoopForDescr чтобы завершить настройку универсальной функции.

Мы приводим только код C, поскольку файлы, необходимые для построения модуля, снова точно такие же, как и раньше, за исключением того, что исходный файл теперь add_triplet.c.

#define PY_SSIZE_T_CLEAN
#include 
#include "numpy/ndarraytypes.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_3kcompat.h"
#include 

/*
 * add_triplet.c
 * This is the C code for creating your own
 * NumPy ufunc for a structured array dtype.
 *
 * Details explaining the Python-C API can be found under
 * 'Extending and Embedding' and 'Python/C API' at
 * docs.python.org.
 */

static PyMethodDef StructUfuncTestMethods[] = {
    {NULL, NULL, 0, NULL}
};

/* The loop definition must precede the PyMODINIT_FUNC. */

static void add_uint64_triplet(char **args, const npy_intp *dimensions,
                               const npy_intp *steps, void *data)
{
    npy_intp i;
    npy_intp is1 = steps[0];
    npy_intp is2 = steps[1];
    npy_intp os = steps[2];
    npy_intp n = dimensions[0];
    uint64_t *x, *y, *z;

    char *i1 = args[0];
    char *i2 = args[1];
    char *op = args[2];

    for (i = 0; i < n; i++) {

        x = (uint64_t *)i1;
        y = (uint64_t *)i2;
        z = (uint64_t *)op;

        z[0] = x[0] + y[0];
        z[1] = x[1] + y[1];
        z[2] = x[2] + y[2];

        i1 += is1;
        i2 += is2;
        op += os;
    }
}

static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "npufunc",
    NULL,
    -1,
    StructUfuncTestMethods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC PyInit_npufunc(void)
{
    PyObject *m, *add_triplet, *d;
    PyObject *dtype_dict;
    PyArray_Descr *dtype;
    PyArray_Descr *dtypes[3];

    import_array();
    import_umath();

    m = PyModule_Create(&moduledef);
    if (m == NULL) {
        return NULL;
    }

    /* Create a new ufunc object */
    add_triplet = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 2, 1,
                                          PyUFunc_None, "add_triplet",
                                          "add_triplet_docstring", 0);

    dtype_dict = Py_BuildValue("[(s, s), (s, s), (s, s)]",
                               "f0", "u8", "f1", "u8", "f2", "u8");
    PyArray_DescrConverter(dtype_dict, &dtype);
    Py_DECREF(dtype_dict);

    dtypes[0] = dtype;
    dtypes[1] = dtype;
    dtypes[2] = dtype;

    /* Register ufunc for structured dtype */
    PyUFunc_RegisterLoopForDescr((PyUFuncObject *)add_triplet,
                                 dtype,
                                 &add_uint64_triplet,
                                 dtypes,
                                 NULL);

    d = PyModule_GetDict(m);

    PyDict_SetItemString(d, "add_triplet", add_triplet);
    Py_DECREF(add_triplet);
    return m;
}

Пример использования:

>>> import npufunc
>>> import numpy as np
>>> a = np.array([(1, 2, 3), (4, 5, 6)], "u8,u8,u8")
>>> npufunc.add_triplet(a, a)
array([(2,  4,  6), (8, 10, 12)],
      dtype=[('f0', '