NumPy 通用函数(ufunc)的实现

NumPy 通用函数（ufunc）是 NumPy 的核心特性之一，我将从设计思想、实现机制和创建自定义 ufunc 几个方面来详细解析：

1. 什么是 ufunc

ufunc（universal function）是一种对数组进行逐元素操作的函数，支持：

广播机制
类型自动转换（type coercion）
向量化操作（在 C 级别循环，避免 Python 开销）

import numpy as np

# 内置的 ufunc 示例
arr = np.array([1, 2, 3, 4])
print(np.sqrt(arr))      # 逐元素平方根
print(np.add(arr, 5))    # 逐元素加法
print(np.multiply(arr, arr))  # 逐元素乘法

2. ufunc 的实现架构

2.1 核心数据结构

// C 层的关键结构（简化示意）
typedef struct {
    PyObject_HEAD

    // ufunc 元信息
    int nin;            // 输入参数个数
    int nout;           // 输出参数个数
    int nargs;          // nin + nout

    // 核心函数指针
    PyUFuncGenericFunction *functions;      // 类型特定的核心计算函数
    void **data;                           // 每个类型特定函数的附加数据
    int ntypes;                            // 支持的数据类型数量

    // 类型处理
    char *types;                          // 类型签名数组
    char *name;                           // ufunc 名称

    // 文档和元数据
    char *doc;

} PyUFuncObject;

2.2 执行流程

Python 调用 np.add(arr1, arr2)
    ↓
1. 参数解析和验证
    ↓
2. 广播检查（broadcasting）
    ↓
3. 类型解析（type resolution）
    ↓
4. 内存分配（输出数组）
    ↓
5. 调用 C 层循环分发器
    ↓
6. 执行类型特定的内核函数
    ↓
7. 返回结果

3. 内核函数（Kernels）的实现

3.1 简单的逐元素加法内核（C 示例）

// 双精度浮点数的加法内核
static void double_add(char **args, npy_intp *dimensions,
                      npy_intp *steps, void *extra) {
    npy_intp i, n = dimensions[0];
    char *in1 = args[0], *in2 = args[1], *out = args[2];
    npy_intp in1_step = steps[0], in2_step = steps[1], out_step = steps[2];

    for (i = 0; i < n; i++) {
        // 类型安全的指针操作
        double x = *(double *)in1;
        double y = *(double *)in2;
        *(double *)out = x + y;

        // 移动到下一个元素
        in1 += in1_step;
        in2 += in2_step;
        out += out_step;
    }
}

// 单精度浮点数版本
static void float_add(char **args, npy_intp *dimensions,
                     npy_intp *steps, void *extra) {
    // 类似实现，使用 float 类型
}

3.2 步幅（strides）处理

steps 数组存储每个数组的步幅（字节）
支持非连续内存（转置、切片等）
支持广播（某些维度的步幅为 0）

4. 创建自定义 ufunc

4.1 使用 `np.frompyfunc`

import numpy as np

# 创建标量函数
def my_func(x, y):
    """自定义的逐元素函数"""
    return x ** 2 + y ** 2

# 转换为 ufunc（返回 Python 对象，效率较低）
my_ufunc = np.frompyfunc(my_func, 2, 1)

arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
result = my_ufunc(arr1, arr2)  # [17, 29, 45]

4.2 使用 `np.vectorize`

# 提供更好的类型控制和性能优化
@np.vectorize
def my_vectorized_func(x, y):
    return np.sin(x) * np.cos(y)

# 指定签名以获得更好的性能
@np.vectorize('float64(float64, float64)', signature='()->()')
def optimized_func(x):
    return x * np.exp(-x)

4.3 使用 Numba 创建高性能 ufunc

import numba
import numpy as np

@numba.vectorize(['float64(float64, float64)',
                  'float32(float32, float32)'])
def numba_add(x, y):
    return x + y

# 编译成机器码，性能接近原生 NumPy
arr1 = np.random.randn(10000)
arr2 = np.random.randn(10000)
result = numba_add(arr1, arr2)

5. 完整示例：创建自定义 ufunc

5.1 简单的 C 扩展实现

# setup.py
from distutils.core import setup, Extension
import numpy as np

module = Extension('myufunc',
                   sources=['myufunc.c'],
                   include_dirs=[np.get_include()])

setup(ext_modules=[module])

// myufunc.c
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <Python.h>
#include <numpy/arrayobject.h>
#include <numpy/ufuncobject.h>

// 内核函数：计算 hypot = sqrt(x² + y²)
static void double_hypot(char **args, npy_intp *dimensions,
                        npy_intp *steps, void *data) {
    npy_intp i, n = dimensions[0];
    char *in1 = args[0], *in2 = args[1], *out = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os = steps[2];

    for (i = 0; i < n; i++) {
        double x = *(double *)in1;
        double y = *(double *)in2;
        *(double *)out = sqrt(x*x + y*y);
        in1 += is1; in2 += is2; out += os;
    }
}

static PyMethodDef module_methods[] = {
    {NULL, NULL, 0, NULL}
};

// 模块初始化
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "myufunc",
    NULL,
    -1,
    module_methods,
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC PyInit_myufunc(void) {
    PyObject *m, *hypot_ufunc, *dtypes[2];
    PyUFuncGenericFunction hypot_funcs[1];
    void *hypot_data[1];
    char hypot_signature[3] = {NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE};

    import_array();
    import_umath();

    m = PyModule_Create(&moduledef);
    if (!m) return NULL;

    // 注册 hypot ufunc
    hypot_funcs[0] = double_hypot;
    hypot_data[0] = NULL;

    // 创建 ufunc 对象
    hypot_ufunc = PyUFunc_FromFuncAndData(
        hypot_funcs, hypot_data,
        hypot_signature, 1,  // 1个类型循环
        2, 1,                // 2个输入，1个输出
        PyUFunc_None,        // 标识
        "hypot",             // 名称
        "custom hypot function (sqrt(x² + y²))",  // 文档
        0                    // 未使用
    );

    // 添加到模块
    PyModule_AddObject(m, "hypot", hypot_ufunc);
    return m;
}

6. 性能优化技巧

6.1 避免类型检查开销

# 使用 numba 编译 ufunc
@numba.guvectorize(['void(float64[:], float64[:], float64[:])'], 
                   '(n),(n)->(n)', nopython=True)
def fast_operation(x, y, out):
    for i in range(x.shape[0]):
        out[i] = x[i] * y[i] + np.sin(x[i])

6.2 使用 SIMD 指令

// 使用 SIMD 内在函数的示例（AVX2）
#include <immintrin.h>

void avx2_add(const double *a, const double *b, double *c, int n) {
    int i;
    for (i = 0; i <= n - 4; i += 4) {
        __m256d va = _mm256_loadu_pd(a + i);
        __m256d vb = _mm256_loadu_pd(b + i);
        __m256d vc = _mm256_add_pd(va, vb);
        _mm256_storeu_pd(c + i, vc);
    }
    // 处理剩余元素
}

7. 特殊 ufunc 特性

7.1 约简（Reduction）操作

arr = np.array([[1, 2, 3], [4, 5, 6]])

# ufunc 的 reduce 方法
print(np.add.reduce(arr, axis=0))     # [5, 7, 9]
print(np.multiply.reduce(arr, axis=1)) # [6, 120]

# accumulate（累积）
print(np.add.accumulate(arr, axis=0))
# [[1, 2, 3],
#  [5, 7, 9]]

7.2 外积（Outer）操作

a = np.array([1, 2, 3])
b = np.array([4, 5, 6, 7])

result = np.multiply.outer(a, b)
# 4x3 矩阵：a[i] * b[j]

8. 调试和性能分析

import numpy as np

# 检查 ufunc 信息
ufunc = np.add
print(f"名称: {ufunc.__name__}")
print(f"文档: {ufunc.__doc__[:100]}...")
print(f"输入数: {ufunc.nin}")
print(f"输出数: {ufunc.nout}")

# 性能对比
import timeit

# Python 循环 vs ufunc
def python_sum(arr):
    result = 0
    for x in arr:
        result += x
    return result

arr = np.random.randn(1000000)

t1 = timeit.timeit(lambda: python_sum(arr), number=10)
t2 = timeit.timeit(lambda: np.sum(arr), number=10)

print(f"Python 循环: {t1:.3f}s")
print(f"NumPy ufunc: {t2:.3f}s")