NumPy 线性回归实现
import numpy as np
import matplotlib.pyplot as plt
class LinearRegression:
    """Linear regression fitted with batch gradient descent.

    The model is ``y = X @ weights + bias``.  When ``normalize`` is true the
    features are standardized (zero mean, unit variance per column) with
    statistics learned in :meth:`fit`; the learned ``weights`` then refer to
    the standardized features, and :meth:`predict` applies the same
    standardization to its input before using them.

    Args:
        learning_rate: Step size of each gradient descent update.
        num_iterations: Number of gradient descent updates performed by fit.
        normalize: Whether to standardize features before training/predicting.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000, normalize=True):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.normalize = normalize
        self.weights = None  # model weights, shape (n_features, 1) once trained
        self.bias = None  # intercept term
        self.mean = None  # per-feature mean used for standardization
        self.std = None  # per-feature standard deviation used for standardization
        self.costs = []  # cost recorded after every update of the last fit()

    @staticmethod
    def _as_column(y):
        """Return y as a float array, reshaping 1-D input to (n_samples, 1)."""
        y = np.asarray(y, dtype=float)
        # A 1-D target would broadcast against (n_samples, 1) predictions
        # into an (n_samples, n_samples) matrix, silently corrupting the math.
        return y.reshape(-1, 1) if y.ndim == 1 else y

    def _normalize(self, X):
        """Learn the standardization statistics from X and return standardized X."""
        X = np.asarray(X, dtype=float)
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        # Constant features have zero spread; divide by 1 instead of 0.
        self.std[self.std == 0] = 1
        return (X - self.mean) / self.std

    def _linear(self, X):
        """Apply the linear model to features that are already preprocessed."""
        return np.dot(X, self.weights) + self.bias

    def predict(self, X):
        """Predict targets for raw (not yet standardized) features.

        Args:
            X: Feature matrix of shape (n_samples, n_features).

        Returns:
            Predictions of shape (n_samples, 1).

        Raises:
            RuntimeError: If the model has not been trained yet.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("LinearRegression is not fitted; call fit() first.")
        X = np.asarray(X, dtype=float)
        if self.normalize:
            if self.mean is None or self.std is None:
                raise RuntimeError(
                    "normalize=True but no statistics are available; call fit() first."
                )
            X = (X - self.mean) / self.std
        return self._linear(X)

    def compute_cost(self, X, y):
        """Return the halved mean squared error of the model on raw features.

        Args:
            X: Feature matrix of shape (n_samples, n_features), unstandardized.
            y: Targets of shape (n_samples, 1) or (n_samples,).

        Returns:
            ``sum((prediction - y) ** 2) / (2 * n_samples)``.
        """
        y = self._as_column(y)
        y_pred = self.predict(X)
        n_samples = y_pred.shape[0]
        cost = (1 / (2 * n_samples)) * np.sum((y_pred - y) ** 2)
        return cost

    def gradient_descent(self, X, y):
        """Run batch gradient descent and return the cost after every update.

        ``X`` is used exactly as given: :meth:`fit` passes features that are
        already standardized, so this method must not standardize them again
        (doing so made the training predictions disagree with the gradient
        and with :meth:`predict`).

        Args:
            X: Preprocessed feature matrix of shape (n_samples, n_features).
            y: Targets of shape (n_samples, 1) or (n_samples,).

        Returns:
            List with one cost value per iteration.

        Raises:
            ValueError: If X is empty or X and y disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        y = self._as_column(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]}."
            )

        # Start from the all-zero model.
        self.weights = np.zeros((n_features, 1))
        self.bias = 0.0
        costs = []

        for i in range(self.num_iterations):
            # Residual of the current model on the training features.
            error = self._linear(X) - y

            # Gradients of the halved MSE with respect to weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Record the cost of the updated parameters.
            cost = (1 / (2 * n_samples)) * np.sum((self._linear(X) - y) ** 2)
            costs.append(cost)
            if i % 100 == 0:
                print(f"Iteration {i}/{self.num_iterations}, Cost: {cost:.4f}")
        return costs

    def fit(self, X, y):
        """Train the model on (X, y).

        Args:
            X: Feature matrix of shape (n_samples, n_features).
            y: Targets of shape (n_samples, 1) or (n_samples,).

        Returns:
            The trained model (``self``), so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        y = self._as_column(y)
        # Learn the standardization statistics on the training data only.
        if self.normalize:
            X = self._normalize(X)
        self.costs = self.gradient_descent(X, y)
        return self
# Demo: train on synthetic data, report the fit and plot the results.
if __name__ == "__main__":
    # Fixed seed so the synthetic data set is reproducible.
    np.random.seed(42)

    # Problem size: 1000 samples with 3 features each.
    n_samples = 1000
    n_features = 3

    # Ground-truth parameters used to generate the targets.
    true_weights = np.array([[2.5], [4.2], [-1.8]])
    true_bias = 3.7

    # Normally distributed features (scaled by 10) and Gaussian noise (scaled by 5).
    X = np.random.randn(n_samples, n_features) * 10
    noise = np.random.randn(n_samples, 1) * 5
    y = np.dot(X, true_weights) + true_bias + noise

    # First 80% of the rows train the model, the remainder evaluates it.
    n_train = int(0.8 * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    # Build and train the model.
    print("开始训练模型...")
    model = LinearRegression(learning_rate=0.01, num_iterations=1000, normalize=True)
    model.fit(X_train, y_train)

    # Report learned parameters next to the generating ones.
    # NOTE(review): with normalize=True the model applies its weights to
    # standardized features, so they are not on the same scale as true_weights.
    print("\n学习到的参数与真实参数对比:")
    print(f"偏置项: {model.bias:.4f} (真实值: {true_bias})")
    learned_and_true = zip(model.weights[:, 0], true_weights[:, 0])
    for number, (learned, truth) in enumerate(learned_and_true, start=1):
        print(f"特征 {number} 的权重: {learned:.4f} (真实值: {truth})")

    # Evaluate on the held-out rows: mean squared error and R^2.
    y_pred = model.predict(X_test)
    squared_residuals = (y_test - y_pred) ** 2
    mse = np.mean(squared_residuals)
    ss_residual = np.sum(squared_residuals)
    ss_total = np.sum((y_test - np.mean(y_test)) ** 2)
    r2 = 1 - (ss_residual / ss_total)
    print("\n测试集性能:")
    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"R² 分数: {r2:.4f}")  # the closer R^2 is to 1, the better the fit

    # Training curve: cost versus iteration.
    plt.figure(figsize=(10, 6))
    plt.plot(range(model.num_iterations), model.costs)
    plt.title("代价函数随迭代次数的变化")
    plt.xlabel("迭代次数")
    plt.ylabel("代价函数值")
    plt.grid(True)
    plt.show()

    # Predicted versus true targets; the dashed red diagonal marks a perfect fit.
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.6)
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.title("预测值 vs 真实值")
    plt.xlabel("真实值")
    plt.ylabel("预测值")
    plt.grid(True)
    plt.show()
转载请注明来源 goldandrabbit.github.io