目标
在链家二手房平台爬取二手房信息,使用最小二乘法建立线性回归模型,并利用matplotlib(v3.2)绘制图表。
求解
爬取并处理数据
参见 https://boyinthesun.cn/post/python-scrapy2/
绘制图表
详见注释,这里还用到了numpy和pandas。
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# The axis labels and title are Chinese, so a CJK-capable font is required.
# 'font.sans-serif' is a priority list: matplotlib uses the first entry that
# is installed, e.g. SimHei (typically Windows) or Arial Unicode MS
# (typically macOS), so no manual switching between platforms is needed.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
# Input CSV; adjust to your own layout. A forward slash is used because it is
# accepted on Windows, macOS and Linux, whereas 'data\\...' only resolves on
# Windows.
path = 'data/data_tongzhou.csv'
# Load only the two columns the regression needs: total price and area.
df = pd.read_csv(path, usecols=['总价', '面积'])
# Least-squares fit of a straight line (simple linear regression).
def fitSLR(x, y):
    """Fit ``y = slope * x + intercept`` by ordinary least squares.

    Uses the closed-form solution::

        slope     = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)
        intercept = mean(y) - slope * mean(x)

    Args:
        x: Sequence of numbers (list, NumPy array or pandas Series) holding
            the independent variable.
        y: Sequence of numbers of the same length holding the dependent
            variable. Elements are paired with ``x`` by position.

    Returns:
        A ``(slope, intercept)`` tuple of NumPy floats. If every ``x`` value
        is identical the denominator is zero and the result is ``nan``/``inf``
        (NumPy emits a RuntimeWarning), as no unique line exists.

    Raises:
        ValueError: If the inputs are empty or their lengths differ.
    """
    # Work on plain arrays so elements are paired by position. Indexing a
    # pandas Series with x[i] is a *label* lookup and breaks (or silently
    # mis-pairs rows) when the index is not 0..n-1, e.g. after filtering.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.shape != y_values.shape:
        raise ValueError(
            'x and y must have the same length, got {} and {}'.format(
                x_values.shape, y_values.shape))
    if x_values.size == 0:
        raise ValueError('x and y must not be empty')

    # Compute each mean once; recomputing them per element made the fit O(n^2).
    x_mean = x_values.mean()
    y_mean = y_values.mean()
    x_dev = x_values - x_mean

    numerator = np.sum(x_dev * (y_values - y_mean))
    denominator = np.sum(x_dev ** 2)

    slope = numerator / denominator
    intercept = y_mean - slope * x_mean
    return slope, intercept
# Fit total price (y) against area (x) to get the line's slope and intercept.
slope, intercept = fitSLR(df['面积'], df['总价'])
# Raw listings as small blue dots; scatter's `s` is the marker area in
# points**2 (not points).
plt.scatter(df['面积'], df['总价'], color='blue', s=0.2)
# Fitted line in red; `linewidth` is in points. The label puts the line's
# equation into the legend.
plt.plot(df['面积'], slope * df['面积'] + intercept, color='red', linewidth=1,
         label='y = {}x + {}'.format(slope, intercept))
plt.xlabel('房屋面积/平方米')
plt.ylabel('房屋总价/万元')
plt.legend(loc='upper left')  # legend position
plt.title('北京市通州区房屋面积和总价关系预测')
# Save BEFORE show(): with a blocking GUI backend, show() returns only after
# the window is closed, at which point the figure has been released and a
# later savefig() would write an empty image.
plt.savefig('北京市通州区房屋面积和总价关系预测.png', bbox_inches='tight')
plt.show()  # display the chart