segmentfault · iLern · Sep 15, 2020 · Sep 15, 2020 · Sep 15, 2020
diff --git a/5 环境污染的预测/README.md b/5 环境污染的预测/README.md
@@ -3,3 +3,4 @@
 近年来，随着人工智能、大数据、云计算等技术的成熟，环保领域正向智能化转变，因为 AI 的加入环境保护充满了无限的想象，而其中环境污染预测就是重要一环——利用环境传感器、智能摄像头、环保设备 IoT 化，我们可以实现对空气、水源、噪声等污染情况进行实时监测及预警，做到“未病先知，未病先治”，进一步改善环境污染。
 
 请以“环境污染的预测”为主题，利用人工智能技术完成一款产品（软件或硬件）的开发。
+
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/Figure_1.png b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/Figure_1.png
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/Figure_2.png b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/Figure_2.png
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/Normalization.py b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/Normalization.py
@@ -0,0 +1,23 @@
+import pandas as pd
+import matplotlib as mpl
+import seaborn as sns
+import csv
+
+from sklearn import preprocessing
+from matplotlib import pyplot as plt
+
+def Normalization():
+    data_file = "..\PRSA2017_Data_20130301-20170228\PRSA_Data_20130301-20170228\simpleDataSet.csv"
+    pd_data = pd.read_csv(data_file)
+    sam = []
+    a = ["PM10", "SO2","NO2","CO","PRES","DEWP"]
+    for i in a:
+        y = pd_data.loc[:, i]
+        ys = list(preprocessing.scale(y))
+        sam.append(ys)
+
+    print(len(sam))
+    with open('eth2.csv', 'w') as file:
+        writer = csv.writer(file)
+        for i in range(len(sam[0])):
+            writer.writerow([sam[0][i],sam[1][i],sam[2][i],sam[3][i],sam[4][i],sam[5][i]])
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/README.md b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/README.md
@@ -0,0 +1,36 @@
+# 基于线性回归的PM2.5的预测模型
+
+## 作品介绍
+
+### 背景
+
+随着经济社会发展，空气污染情况成为我们不得不关注的问题，其中 PM2.5 是重要的空气质量评价指标。本项目使用 `sklearn` 进行线性回归建模，并对 PM2.5 进行预测。
+
+### 截图
+
+![](./Figure_2.png)
+
+经过绘图发现，PM2.5 与空气中的 PM10、$SO_2$、$NO_2$、$CO$，以及气压、露点温度近似成线性关系，因此使用 `sklearn` 中的线性回归模型进行训练、拟合并预测。
+
+![](./Figure_1.png)
+
+取数据中的$20\%$作为测试数据验证模型的可靠性，如上图所示。
+
+**数据来源**：
+
+[UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets.php?format=mat&task=&att=&area=phys&numAtt=10to100&numIns=&type=ts&sort=attUp&view=list)
+
+## 团队：今天睡醒了吗
+
+**成员**：iLern
+
+**联系方式**：
+
+> email: [email protected]
+>
+> qq: 416138794
+
+## 使用到的AWS技术
+
+`Amazon SageMaker`
+
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/__pycache__/Normalization.cpython-38.pyc b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/__pycache__/Normalization.cpython-38.pyc
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/__pycache__/display_lr.cpython-38.pyc b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/__pycache__/display_lr.cpython-38.pyc
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/__pycache__/predict.cpython-38.pyc b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/__pycache__/predict.cpython-38.pyc
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/display_lr.py b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/display_lr.py
@@ -0,0 +1,21 @@
+import pandas as pd
+import matplotlib as mpl
+import seaborn as sns
+
+from matplotlib import pyplot as plt
+
+# data_file = "..\PRSA2017_Data_20130301-20170228\PRSA_Data_20130301-20170228\PRSA_Data_Aotizhongxin_20130301-20170228.csv"
+data_file = "..\PRSA2017_Data_20130301-20170228\PRSA_Data_20130301-20170228\simpleDataSet.csv"
+
+def display_data():
+    pd_data = pd.read_csv(data_file)
+    print(f'pd_data.head(10) = \n{pd_data.head(10)}')
+    mpl.rcParams['axes.unicode_minus'] = False
+    sns.pairplot(pd_data, 
+                x_vars = ["PM10", "SO2","NO2","CO","PRES","DEWP"],  
+                y_vars = ["PM2.5"], 
+                # dropna = True,
+                kind = "reg", 
+                height = 5,
+                aspect = 0.7)
+    plt.show()
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/eth2.csv b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/eth2.csv
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/main.py b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/main.py
@@ -0,0 +1,9 @@
+import display_lr
+import Normalization
+import predict
+
+if __name__ == "__main__":
+    # display_lr.display_data()
+    # Normalization.Normalization()
+
+    predict.build_lr()
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/predict.ipynb b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/predict.ipynb
@@ -0,0 +1,139 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-09-13T13:56:40.324016Z",
+     "start_time": "2020-09-13T13:56:36.982915Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import matplotlib as mpl\n",
+    "import seaborn as sns\n",
+    "import numpy as np\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_file = \"..\\PRSA2017_Data_20130301-20170228\\PRSA_Data_20130301-20170228\\simpleDataSet.csv\"\n",
+    "def build_lr():\n",
+    "    pd_data = pd.read_csv(data_file)\n",
+    "\n",
+    "    X = pd_data.loc[:, (\"PM10\", \"SO2\",\"NO2\",\"CO\",\"PRES\",\"DEWP\")]\n",
+    "    y = pd_data.loc[:, \"PM2.5\"]\n",
+    "    \n",
+    "    #选择20%为测试集\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=532) \n",
+    "\n",
+    "    print('训练集测试及参数:')\n",
+    "    print(f'X_train.shape={X_train.shape}\\ny_train.shape ={y_train.shape}\\nX_test.shape={X_test.shape}\\ny_test.shape={y_test.shape}')\n",
+    "\n",
+    "    linear_reg = LinearRegression()\n",
+    "    model = linear_reg.fit(X_train, y_train)\n",
+    "\n",
+    "    print('模型参数')\n",
+    "    print(model)\n",
+    "\n",
+    "    print('模型截距')\n",
+    "    print(linear_reg.intercept_)\n",
+    "\n",
+    "    print('参数权重')\n",
+    "    print(linear_reg.coef_)\n",
+    "\n",
+    "    y_pred = linear_reg.predict(X_test)\n",
+    "    sum_mean = 0\n",
+    "\n",
+    "    for i in range(len(y_pred)):\n",
+    "        sum_mean += (y_pred[i] - y_test.values[i]) ** 2\n",
+    "\n",
+    "    sum_erro = np.sqrt(sum_mean / len(y_pred))\n",
+    "\n",
+    "    print(sum_erro)\n",
+    "\n",
+    "    plt.figure()\n",
+    "    plt.figure()\n",
+    "    plt.plot(range(len(y_pred)), y_pred, 'b', label=\"predict\")\n",
+    "    plt.plot(range(len(y_pred)), y_test, 'r', label=\"test\")\n",
+    "    plt.legend(loc=\"upper right\")\n",
+    "    plt.xlabel(\"the number of sales\")\n",
+    "    plt.ylabel('value of sales')\n",
+    "    plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/predict.py b/5 环境污染的预测/pm2.5的线性回归预测模型-今天睡醒了吗/predict.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import matplotlib as mpl
+import seaborn as sns
+import numpy as np
+
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from matplotlib import pyplot as plt
+
+
+data_file = "..\PRSA2017_Data_20130301-20170228\PRSA_Data_20130301-20170228\simpleDataSet.csv"
+def build_lr():
+    pd_data = pd.read_csv(data_file)
+
+    X = pd_data.loc[:, ("PM10", "SO2","NO2","CO","PRES","DEWP")]
+    y = pd_data.loc[:, "PM2.5"]
+
+    #选择20%为测试集
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=532) 
+
+    print('训练集测试及参数:')
+    print(f'X_train.shape={X_train.shape}\ny_train.shape ={y_train.shape}\nX_test.shape={X_test.shape}\ny_test.shape={y_test.shape}')
+
+    linear_reg = LinearRegression()
+    model = linear_reg.fit(X_train, y_train)
+
+    print('模型参数')
+    print(model)
+
+    print('模型截距')
+    print(linear_reg.intercept_)
+
+    print('参数权重')
+    print(linear_reg.coef_)
+
+    y_pred = linear_reg.predict(X_test)
+    sum_mean = 0
+
+    for i in range(len(y_pred)):
+        sum_mean += (y_pred[i] - y_test.values[i]) ** 2
+
+    sum_erro = np.sqrt(sum_mean / len(y_pred))
+
+    print(sum_erro)
+
+    plt.figure()
+    plt.figure()
+    plt.plot(range(len(y_pred)), y_pred, 'b', label="predict")
+    plt.plot(range(len(y_pred)), y_test, 'r', label="test")
+    plt.legend(loc="upper right")
+    plt.xlabel("the number of sales")
+    plt.ylabel('value of sales')
+    plt.show()