我使用一个简单的数据集来计算x和y值之间的线性回归。我用Python和C++编写了代码。然而,由于操作过程中的舍入(我相信),输出差异很大。最低可运行产品:
cpp代码:
void linear_regression1(){
vector<int> independent;
independent.push_back(60300);
independent.push_back(60302);
independent.push_back(60302);
independent.push_back(60290);
independent.push_back(60292);
independent.push_back(60290);
vector<double> dependent_mid;
dependent_mid.push_back(143.5);
dependent_mid.push_back(143.5);
dependent_mid.push_back(143.5);
dependent_mid.push_back(142.5);
dependent_mid.push_back(142.5);
dependent_mid.push_back(142.5);
double sumX=0, sumX2=0, sumY=0, sumXY=0, a, b;
int n = independent.size();
for(int i=0;i<n;i++)
{
sumX = sumX + independent[i];
sumX2 = sumX2 + independent[i]*independent[i];
sumY = sumY + dependent_mid[i];
sumXY = sumXY + independent[i]*dependent_mid[i];
//cout<<sumXY<<endl;
}
/* Calculating a and b */
b = (n*sumXY-sumX*sumY)/(n*sumX2-sumX*sumX);
a = (sumX2*sumY-sumX*sumXY)/(sumX2*n-sumX*sumX);
/* Displaying value of a and b */
cout<<"Equation of best fit is: y = "<< a <<" + "<< b<<"x";
}
python代码:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
data = {'x': [60300, 60302, 60300, 60302, 60290, 60292],
'y_mid': [143.5, 143.5, 143.5, 143.5, 142.5, 142.5]
}
df = pd.DataFrame(data, columns = ['x', 'y_mid'])
X = df['x']
y = df['y_mid']
sumX=0
sumX2=0
sumY=0
sumXY=0
for i in range(len(X)):
sumX = sumX + X[i]
sumX2 = sumX2 + X[i]*X[i]
sumY = sumY + y[i]
sumXY = sumXY + X[i]*y[i]
b = (len(X)*sumXY-sumX*sumY)/(len(X)*sumX2-sumX*sumX)
a = (sumX2*sumY-sumX*sumXY)/(sumX2*len(X)-sumX*sumX)
print(a)
print(b)
我知道Python的结果是正确的
代码中存在整数溢出问题:60'300*60'300=3'636'090'000,而
int
通常为4字节长,只能保存[-2^31,2^31-1]=[-2'147'483'648,2'147'483'647]范围内的值Python的整数不会溢出
相关问题 更多 >
编程相关推荐