Numpy 线性回归

Coding Linear Regression
Created At : 2018-04-01 20:00
Numpy 线性回归实现
Numpy 线性回归实现

import numpy as np
import matplotlib.pyplot as plt

class LinearRegression:
    """Linear regression trained with batch gradient descent.

    The model is ``y_hat = X @ weights + bias``. When ``normalize`` is true,
    ``fit`` standardizes every feature column with the training mean and
    standard deviation, and ``predict`` applies the same transform to its
    input, so callers always pass features in their original scale.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000, normalize=True):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.normalize = normalize
        self.weights = None  # column vector (n_features, 1), set by gradient_descent
        self.bias = None     # scalar intercept, set by gradient_descent
        self.mean = None     # per-feature mean captured by _normalize
        self.std = None      # per-feature std captured by _normalize (zeros replaced by 1)
        self.costs = None    # cost recorded after every iteration of the last fit

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an array, reshaping 1-D input to ``(n_samples, 1)``."""
        y = np.asarray(y)
        # A 1-D y would broadcast against the (n, 1) predictions into an
        # (n, n) matrix and silently corrupt both the cost and the gradients.
        return y.reshape(-1, 1) if y.ndim == 1 else y

    @staticmethod
    def _half_mse(y_pred, y):
        """Return the half mean squared error, ``sum((y_pred - y)^2) / (2n)``."""
        return np.sum((y_pred - y) ** 2) / (2 * y.shape[0])

    def _linear(self, X):
        """Apply the linear map to features that are already preprocessed."""
        return np.dot(X, self.weights) + self.bias

    def _normalize(self, X):
        """Store the column statistics of ``X`` and return ``X`` standardized."""
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        # Constant columns have zero spread; divide by 1 to avoid 0/0.
        self.std[self.std == 0] = 1
        return (X - self.mean) / self.std

    def predict(self, X):
        """Predict targets for features given in their original scale.

        X: feature matrix of shape (n_samples, n_features)
        Returns: predictions of shape (n_samples, 1)
        """
        if self.normalize:
            # Reuse the statistics stored during fit.
            X = (X - self.mean) / self.std
        return self._linear(X)

    def compute_cost(self, X, y):
        """Return the half mean squared error of the model on ``(X, y)``.

        X: feature matrix in its original scale, shape (n_samples, n_features)
        y: targets of shape (n_samples, 1) or (n_samples,)
        """
        y = self._as_column(y)
        return self._half_mse(self.predict(X), y)

    def gradient_descent(self, X, y):
        """Run batch gradient descent and return the per-iteration costs.

        X: feature matrix of shape (n_samples, n_features). It is used as-is:
           ``fit`` standardizes it beforehand when ``normalize`` is enabled,
           so no normalization is applied again here.
        y: targets of shape (n_samples, 1) or (n_samples,)
        Returns: list with the cost measured after each parameter update
        """
        y = self._as_column(y)
        n_samples, n_features = X.shape

        # Start from the zero model.
        self.weights = np.zeros((n_features, 1))
        self.bias = 0.0

        costs = []
        for i in range(self.num_iterations):
            # Use the raw linear map here: going through predict() would
            # standardize the already-standardized X a second time.
            error = self._linear(X) - y
            dw = np.dot(X.T, error) / n_samples
            db = np.sum(error) / n_samples

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            cost = self._half_mse(self._linear(X), y)
            costs.append(cost)
            if i % 100 == 0:
                print(f"Iteration {i}/{self.num_iterations}, Cost: {cost:.4f}")
        return costs

    def fit(self, X, y):
        """Train the model on ``(X, y)`` and return ``self``.

        X: feature matrix of shape (n_samples, n_features)
        y: targets of shape (n_samples, 1) or (n_samples,)
        """
        X = np.asarray(X, dtype=float)
        y = self._as_column(y)

        if self.normalize:
            X = self._normalize(X)

        self.costs = self.gradient_descent(X, y)
        return self

# Demo: train on synthetic data and compare the fitted model with the ground truth
if __name__ == "__main__":
    # Fixed seed so every run draws the same synthetic data set.
    np.random.seed(42)
    n_samples, n_features = 1000, 3

    # Ground-truth parameters the model is expected to approximate.
    true_weights = np.array([[2.5], [4.2], [-1.8]])
    true_bias = 3.7

    # Draw the features first and the noise second so the random stream is
    # consumed in a fixed order.
    X = 10 * np.random.randn(n_samples, n_features)
    noise = np.random.randn(n_samples, 1) * 5
    y = X.dot(true_weights) + true_bias + noise

    # First 80% of the rows for training, the remainder for testing.
    n_train = int(0.8 * n_samples)
    X_train, X_test = np.split(X, [n_train])
    y_train, y_test = np.split(y, [n_train])

    print("开始训练模型...")
    model = LinearRegression(learning_rate=0.01, num_iterations=1000, normalize=True)
    model.fit(X_train, y_train)

    # Report the learned parameters next to the true ones.
    print("\n学习到的参数与真实参数对比:")
    print(f"偏置项: {model.bias:.4f} (真实值: {true_bias})")
    learned_and_true = zip(model.weights[:n_features, 0], true_weights[:, 0])
    for feature_no, (learned, actual) in enumerate(learned_and_true, start=1):
        print(f"特征 {feature_no} 的权重: {learned:.4f} (真实值: {actual})")

    # Evaluate on the held-out rows: mean squared error and R^2.
    y_pred = model.predict(X_test)
    squared_errors = (y_test - y_pred) ** 2
    mse = np.mean(squared_errors)
    ss_residual = np.sum(squared_errors)
    ss_total = np.sum((y_test - y_test.mean()) ** 2)
    r2 = 1 - (ss_residual / ss_total)

    print("\n测试集性能:")
    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"R² 分数: {r2:.4f}")  # the closer to 1, the better the fit

    # Figure 1: cost recorded at every training iteration.
    plt.figure(figsize=(10, 6))
    plt.plot(range(model.num_iterations), model.costs)
    plt.title("代价函数随迭代次数的变化")
    plt.xlabel("迭代次数")
    plt.ylabel("代价函数值")
    plt.grid(True)
    plt.show()

    # Figure 2: predicted vs. true targets; the dashed red diagonal marks
    # where predicted equals true.
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.6)
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.title("预测值 vs 真实值")
    plt.xlabel("真实值")
    plt.ylabel("预测值")
    plt.grid(True)
    plt.show()
转载请注明来源 goldandrabbit.github.io