diff --git a/01_data_exploration_and_visualization.ipynb b/01_data_exploration_and_visualization.ipynb new file mode 100644 index 0000000..3296599 --- /dev/null +++ b/01_data_exploration_and_visualization.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 农业数据集探索与可视化分析\n", + "\n", + "本 Notebook 演示如何加载和分析作物病害、气象、产量等多类数据集,并进行可视化展示。\n", + "\n", + "**场景:** 数据分析 / 可视化\n", + "\n", + "**数据集:**\n", + "- CSV: 作物病害标注表\n", + "- TSV: 农业气象数据\n", + "- Parquet: 作物产量统计\n", + "- JSON: 作物病害检测标注(COCO 格式)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 环境准备" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib\n", + "import numpy as np\n", + "import json\n", + "import os\n", + "\n", + "# Configure fonts for Chinese text in the charts\n", + "matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'WenQuanYi Micro Hei', 'DejaVu Sans']\n", + "matplotlib.rcParams['axes.unicode_minus'] = False\n", + "\n", + "# Data root = parent of the current working directory. NOTE(review): confirm the csv/, tsv/, parquet/ and json/ folders live there\n", + "DATA_DIR = os.path.dirname(os.getcwd())\n", + "print('数据目录:', DATA_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 加载 CSV - 作物病害标注表" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the CSV data\n", + "df_csv = pd.read_csv(f'{DATA_DIR}/csv/作物病害标注表.csv')\n", + "print(f'数据集大小: {df_csv.shape}')\n", + "df_csv.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Basic information: dtypes, missing-value counts, summary statistics\n", + "print('=== 数据类型 ===')\n", + "print(df_csv.dtypes)\n", + "print('\\n=== 缺失值统计 ===')\n", + "print(df_csv.isnull().sum())\n", + "print('\\n=== 基本统计 ===')\n", + "df_csv.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
作物病害分布分析" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, figsize=(14, 10))  # 2x2 grid, one panel per view of df_csv\n", + "\n", + "# 3.1 Number of disease records per crop\n", + "per_crop = df_csv['作物'].value_counts()\n", + "axes[0, 0].bar(per_crop.index, per_crop.values, color=plt.cm.Set3.colors[:len(per_crop)])\n", + "axes[0, 0].set_xlabel('作物')\n", + "axes[0, 0].set_ylabel('记录数')\n", + "axes[0, 0].set_title('各作物病害记录数')\n", + "axes[0, 0].tick_params(axis='x', rotation=45)\n", + "\n", + "# 3.2 Severity distribution (pie chart)\n", + "per_severity = df_csv['严重程度'].value_counts()\n", + "axes[0, 1].pie(per_severity.values, labels=per_severity.index, autopct='%1.1f%%',\n", + "               colors=plt.cm.RdYlGn_r(np.linspace(0.1, 0.9, len(per_severity))))\n", + "axes[0, 1].set_title('病害严重程度分布')\n", + "\n", + "# 3.3 Disease records per region\n", + "per_region = df_csv['地区'].value_counts()\n", + "axes[1, 0].barh(per_region.index, per_region.values, color=plt.cm.Paired.colors[:len(per_region)])\n", + "axes[1, 0].set_xlabel('记录数')\n", + "axes[1, 0].set_title('各地区病害记录分布')\n", + "\n", + "# 3.4 Temperature vs. humidity scatter, one colour per severity level listed below\n", + "severity_palette = {'轻微': 'green', '轻度': 'yellowgreen', '中度': 'orange', '重度': 'red', '极重度': 'darkred'}\n", + "for level, shade in severity_palette.items():\n", + "    rows = df_csv[df_csv['严重程度'] == level]\n", + "    axes[1, 1].scatter(rows['温度_℃'], rows['湿度_%'],\n", + "                       c=shade, label=level, alpha=0.6, s=20)\n", + "axes[1, 1].set_xlabel('温度 (℃)')\n", + "axes[1, 1].set_ylabel('湿度 (%)')\n", + "axes[1, 1].set_title('温度与湿度关系(按严重程度着色)')\n", + "axes[1, 1].legend(fontsize=8)\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('作物病害分析图表.png', dpi=150, bbox_inches='tight')\n", + "plt.show()\n", + "print('图表已保存')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
加载 TSV - 农业气象数据分析" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the TSV data and parse the date column\n", + "df_weather = pd.read_csv(f'{DATA_DIR}/tsv/农业气象数据.tsv', sep='\\t')\n", + "df_weather['日期'] = pd.to_datetime(df_weather['日期'])\n", + "print(f'气象数据集大小: {df_weather.shape}')\n", + "df_weather.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n", + "\n", + "# Monthly temperature trend. The month key is a standalone Series rather than a new df_weather column, so the column count printed by the quality report cell still matches the TSV file\n", + "month = df_weather['日期'].dt.month.rename('月份')\n", + "monthly_temp = df_weather.groupby(month)['平均温度_℃'].agg(['mean', 'min', 'max'])\n", + "axes[0, 0].plot(monthly_temp.index, monthly_temp['mean'], 'r-o', label='平均温度')\n", + "axes[0, 0].fill_between(monthly_temp.index, monthly_temp['min'], monthly_temp['max'], alpha=0.2, color='red')\n", + "axes[0, 0].set_title('月度温度趋势')\n", + "axes[0, 0].set_xlabel('月份')\n", + "axes[0, 0].set_ylabel('温度 (℃)')\n", + "axes[0, 0].legend()\n", + "axes[0, 0].grid(True, alpha=0.3)\n", + "\n", + "# Cumulative precipitation per month (same month key as above)\n", + "monthly_rain = df_weather.groupby(month)['降水量_mm'].sum()\n", + "axes[0, 1].bar(monthly_rain.index, monthly_rain.values, color='steelblue', alpha=0.8)\n", + "axes[0, 1].set_title('月度累计降水量')\n", + "axes[0, 1].set_xlabel('月份')\n", + "axes[0, 1].set_ylabel('降水量 (mm)')\n", + "\n", + "# Relative-humidity histogram with the mean marked by a dashed line\n", + "axes[1, 0].hist(df_weather['相对湿度_%'], bins=30, color='teal', alpha=0.7, edgecolor='white')\n", + "axes[1, 0].axvline(df_weather['相对湿度_%'].mean(), color='red', linestyle='--', label=f'均值: {df_weather[\"相对湿度_%\"].mean():.1f}%')\n", + "axes[1, 0].set_title('相对湿度分布')\n", + "axes[1, 0].set_xlabel('湿度 (%)')\n", + "axes[1, 0].legend()\n", + "\n", + "# Share of each weather condition\n", + "weather_counts = df_weather['天气状况'].value_counts()\n", + "axes[1, 1].pie(weather_counts.values, labels=weather_counts.index, autopct='%1.1f%%',\n", + "               colors=plt.cm.Set2.colors[:len(weather_counts)])\n", + "axes[1, 1].set_title('天气状况分布')\n", + "\n", + 
"plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 加载 Parquet - 作物产量分析" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Parquet data\n", + "df_yield = pd.read_parquet(f'{DATA_DIR}/parquet/作物产量统计.parquet')\n", + "print(f'产量数据集大小: {df_yield.shape}')\n", + "df_yield.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n", + "\n", + "# Mean yield per crop, highest first\n", + "crop_yield = df_yield.groupby('作物')['亩产_公斤'].mean().sort_values(ascending=False)\n", + "axes[0, 0].bar(crop_yield.index, crop_yield.values, color=plt.cm.tab10.colors[:len(crop_yield)])\n", + "axes[0, 0].set_title('各作物平均亩产')\n", + "axes[0, 0].set_ylabel('亩产 (公斤)')\n", + "axes[0, 0].tick_params(axis='x', rotation=45)\n", + "\n", + "# Mean yield per year\n", + "yearly = df_yield.groupby('年份')['亩产_公斤'].mean()\n", + "axes[0, 1].plot(yearly.index, yearly.values, 'b-s', markersize=8)\n", + "axes[0, 1].set_title('年度平均亩产趋势')\n", + "axes[0, 1].set_ylabel('亩产 (公斤)')\n", + "axes[0, 1].grid(True, alpha=0.3)\n", + "\n", + "# Mean yield per irrigation method\n", + "irrigation = df_yield.groupby('灌溉方式')['亩产_公斤'].mean().sort_values()\n", + "axes[1, 0].barh(irrigation.index, irrigation.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])\n", + "axes[1, 0].set_title('不同灌溉方式平均亩产')\n", + "axes[1, 0].set_xlabel('亩产 (公斤)')\n", + "\n", + "# Mean yield per natural-disaster category; the '无' bar is green, all others red\n", + "disaster = df_yield.groupby('自然灾害')['亩产_公斤'].mean().sort_values()\n", + "axes[1, 1].bar(disaster.index, disaster.values, color=['#2ecc71' if x == '无' else '#e74c3c' for x in disaster.index])\n", + "axes[1, 1].set_title('自然灾害对亩产影响')\n", + "axes[1, 1].set_ylabel('亩产 (公斤)')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
加载 COCO JSON - 标注数据分析" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the COCO-format annotation file\n", + "with open(f'{DATA_DIR}/json/作物病害检测_COCO格式.json', 'r', encoding='utf-8') as f:\n", + "    coco = json.load(f)\n", + "\n", + "print(f'图片数量: {len(coco[\"images\"])}')\n", + "print(f'标注数量: {len(coco[\"annotations\"])}')\n", + "print(f'类别数量: {len(coco[\"categories\"])}')\n", + "print('\\n类别列表:')\n", + "for cat in coco['categories']:\n", + "    count = sum(1 for a in coco['annotations'] if a['category_id'] == cat['id'])\n", + "    print(f' {cat[\"name\"]}: {count} 个标注')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualise COCO annotation statistics\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n", + "\n", + "# Number of annotations per category name\n", + "cat_names = {c['id']: c['name'] for c in coco['categories']}\n", + "cat_counts = {}\n", + "for a in coco['annotations']:\n", + "    name = cat_names[a['category_id']]\n", + "    cat_counts[name] = cat_counts.get(name, 0) + 1\n", + "\n", + "axes[0].bar(cat_counts.keys(), cat_counts.values(), color=plt.cm.Paired.colors[:len(cat_counts)])\n", + "axes[0].set_title('各类别标注数量')\n", + "axes[0].tick_params(axis='x', rotation=45)\n", + "\n", + "# Annotations per image (only images that have at least one annotation are tallied); default=1 keeps the bin range valid when there are no annotations, since max() of an empty list raises ValueError\n", + "img_ann_counts = {}\n", + "for a in coco['annotations']:\n", + "    img_ann_counts[a['image_id']] = img_ann_counts.get(a['image_id'], 0) + 1\n", + "counts = list(img_ann_counts.values())\n", + "axes[1].hist(counts, bins=range(1, max(counts, default=1) + 2), color='steelblue', alpha=0.7, edgecolor='white')\n", + "axes[1].set_title('每张图标注数量分布')\n", + "axes[1].set_xlabel('标注数量')\n", + "axes[1].set_ylabel('图片数')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
数据质量报告" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('=' * 60)  # Summary of the four datasets loaded by the earlier cells\n", + "print(' 农业数据集质量报告')\n", + "print('=' * 60)\n", + "print('\\n📋 CSV 作物病害标注表:')  # plain string: no placeholders, so no f prefix\n", + "print(f' 记录数: {len(df_csv)}')\n", + "print(f' 字段数: {len(df_csv.columns)}')\n", + "print(f' 缺失值: {df_csv.isnull().sum().sum()}')\n", + "print(f' 作物种类: {df_csv[\"作物\"].nunique()} 种')\n", + "print(f' 病害类型: {df_csv[\"病害名称\"].nunique()} 种')\n", + "\n", + "print('\\n📋 TSV 农业气象数据:')\n", + "print(f' 记录数: {len(df_weather)}')\n", + "print(f' 字段数: {len(df_weather.columns)}')\n", + "print(f' 时间范围: {df_weather[\"日期\"].min()} ~ {df_weather[\"日期\"].max()}')\n", + "\n", + "print('\\n📋 Parquet 作物产量统计:')\n", + "print(f' 记录数: {len(df_yield)}')\n", + "print(f' 字段数: {len(df_yield.columns)}')\n", + "print(f' 年份范围: {df_yield[\"年份\"].min()} ~ {df_yield[\"年份\"].max()}')\n", + "\n", + "print('\\n📋 COCO 作物病害检测标注:')\n", + "print(f' 图片数: {len(coco[\"images\"])}')\n", + "print(f' 标注数: {len(coco[\"annotations\"])}')\n", + "print(f' 类别数: {len(coco[\"categories\"])}')\n", + "print('=' * 60)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file