import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('sensor_data.csv')

# 1. Sensor data statistics
# Group by sensor type and compute the count and mean for each group
sensor_stats = data.groupby('SensorType')['Value'].agg(['count', 'mean'])
# Print the result
print("传感器数据数量和平均值:")
print(sensor_stats)

# 2. Temperature and humidity statistics by location
# Filter the temperature and humidity readings, group by location and sensor type,
# and compute the mean of each group
location_stats = (
    data[data['SensorType'].isin(['Temperature', 'Humidity'])]
    .groupby(['Location', 'SensorType'])['Value']
    .mean()
    .unstack()
)
# Print the result
print("每个位置的温度和湿度数据平均值:")
print(location_stats)

# 3. Data cleaning and outlier handling
# Flag out-of-range readings: temperature outside [-10, 50] or humidity outside [0, 100]
data['is_abnormal'] = np.where(
    ((data['SensorType'] == 'Temperature') & ((data['Value'] < -10) | (data['Value'] > 50)))
    | ((data['SensorType'] == 'Humidity') & ((data['Value'] < 0) | (data['Value'] > 100))),
    True, False
)
# Print the number of abnormal readings
print("异常值数量:", data['is_abnormal'].sum())

# Fill missing values with forward fill, then backward fill
data['Value'] = data['Value'].ffill()
data['Value'] = data['Value'].bfill()

# Save the cleaned data: drop the flag column and write to a new CSV file
cleaned_data = data.drop(columns=['is_abnormal'])
cleaned_data.to_csv('cleaned_sensor_data.csv', index=False)
print("数据清洗完成,已保存为 'cleaned_sensor_data.csv'")
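The hard-coded thresholds above become awkward to extend if more sensor types are added. Below is a minimal alternative sketch, not part of the original answer: it reuses the data DataFrame and pandas import from the block above, and the valid_ranges dict and out_of_range helper are illustrative names with the same ranges as the answer.

# Sketch only: keep the valid range per sensor type in one place so the
# anomaly mask is easy to extend (ranges taken from the answer above).
valid_ranges = {'Temperature': (-10, 50), 'Humidity': (0, 100)}

def out_of_range(row):
    # Missing readings are left to ffill/bfill rather than flagged as abnormal
    if pd.isna(row['Value']):
        return False
    low, high = valid_ranges.get(row['SensorType'], (float('-inf'), float('inf')))
    return not (low <= row['Value'] <= high)

data['is_abnormal'] = data.apply(out_of_range, axis=1)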
import pandas as pd

# Load the dataset
data = pd.read_csv('credit_data.csv')

# 1. Data completeness audit
missing_values = data.isnull().sum()        # Missing-value counts (2 points)
duplicate_values = data.duplicated().sum()  # Duplicate-row count (2 points)
# Print the results
print("缺失值统计:")
print(missing_values)
print("重复值统计:")
print(duplicate_values)

# 2. Data plausibility audit
data['is_age_valid'] = data['Age'].between(18, 70)                        # Age plausibility check (2 points)
data['is_income_valid'] = data['Income'] > 2000                           # Income plausibility check (2 points)
data['is_loan_amount_valid'] = data['LoanAmount'] < (data['Income'] * 5)  # LoanAmount plausibility check (2 points)
data['is_credit_score_valid'] = data['CreditScore'].between(300, 850)     # CreditScore plausibility check (2 points)

# Combine the individual checks into a single validity flag
validity_checks = data[['is_age_valid', 'is_income_valid',
                        'is_loan_amount_valid', 'is_credit_score_valid']].all(axis=1)
data['is_valid'] = validity_checks
# Print the results
print("数据合理性检查:")
print(data[['is_age_valid', 'is_income_valid', 'is_loan_amount_valid',
            'is_credit_score_valid', 'is_valid']].describe())

# 3. Data cleaning and outlier handling
# Rows that fail at least one plausibility check
invalid_rows = data[~data['is_valid']]
# Keep only the valid rows
cleaned_data = data[data['is_valid']]
# Drop the helper flag columns
cleaned_data = cleaned_data.drop(columns=['is_age_valid', 'is_income_valid',
                                          'is_loan_amount_valid', 'is_credit_score_valid', 'is_valid'])
# Save the cleaned data
cleaned_data.to_csv('cleaned_credit_data.csv', index=False)
print("数据清洗完成,已保存为 'cleaned_credit_data.csv'")
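Before the invalid rows are dropped, it can help to see which rule they violate. A small sketch, not part of the original answer, reusing the is_*_valid flag columns created above:

# Sketch only: count how many rows fail each plausibility rule, and how many
# rows fail at least one rule (uses the flag columns created above).
rule_cols = ['is_age_valid', 'is_income_valid', 'is_loan_amount_valid', 'is_credit_score_valid']
print((~data[rule_cols]).sum())                                  # failures per rule
print("rows failing at least one rule:", (~data['is_valid']).sum())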
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Load the dataset with gbk encoding
data = pd.read_csv('medical_data.csv', encoding='gbk')

# Inspect data types and table structure
print(data.dtypes)
data.info()
# Number of missing values per column
print(data.isnull().sum())

# Normalize the date formats
data['就诊日期'] = pd.to_datetime(data['就诊日期'])
data['诊断日期'] = pd.to_datetime(data['诊断日期'])
# Rename the patient ID column
data.rename(columns={'病人ID': '患者ID'}, inplace=True)
# Check the modified table
print(data.head())

# Add diagnosis delay (days between visit and diagnosis) and disease duration
data['诊断延迟'] = (data['诊断日期'] - data['就诊日期']).dt.days
data['病程'] = (datetime(2024, 9, 1) - data['诊断日期']).dt.days

# Drop implausible rows (negative diagnosis delay, age outside (0, 120))
data = data[(data['诊断延迟'] >= 0) & (data['年龄'] > 0) & (data['年龄'] < 120)]
# Check the modified data
print(data.describe())

# Drop duplicate rows and record how many were removed
initial_rows = data.shape[0]
data.drop_duplicates(inplace=True)
deleted_rows = initial_rows - data.shape[0]
print(f'删除的重复行数:{deleted_rows}')

# Min-max normalize the numeric columns
scaler = MinMaxScaler()
columns_to_normalize = ['年龄', '体重', '身高']
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])
# Check the normalized data
print(data.head())

# Treatment outcome distribution by disease type
treatment_outcome_distribution = data.groupby('疾病类型')['治疗结果'].value_counts().unstack()

# Set a Chinese font (adjust the path for your system)
# font_path = 'C:/Windows/Fonts/simhei.ttf'
font_path = '/System/Library/Fonts/Hiragino Sans GB.ttc'
my_font = fm.FontProperties(fname=font_path)

# Stacked bar chart of treatment outcomes per disease type
treatment_outcome_distribution.plot(kind='bar', stacked=True)
plt.title('不同疾病类型的治疗结果分布', fontproperties=my_font)
plt.xlabel('疾病类型', fontproperties=my_font)
plt.ylabel('治疗结果数量', fontproperties=my_font)
plt.xticks(fontproperties=my_font)   # x-axis tick label font
plt.yticks(fontproperties=my_font)   # y-axis tick label font
plt.legend(prop=my_font)             # legend font
plt.show()

# Scatter plot of age versus disease severity
plt.scatter(data['年龄'], data['疾病严重程度'])
plt.title('年龄和疾病严重程度的关系', fontproperties=my_font)
plt.xlabel('年龄', fontproperties=my_font)
plt.ylabel('疾病严重程度', fontproperties=my_font)
plt.xticks(fontproperties=my_font)   # x-axis tick label font
plt.yticks(fontproperties=my_font)   # y-axis tick label font
plt.show()

# Save the cleaned data
output_path = '2.1.4_cleaned_data.csv'
data.to_csv(output_path, index=False)
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the dataset (the loading step is missing from the source; the filename
# below is a placeholder)
data = pd.read_csv('fitness_survey.csv')

# Show the table structure and per-column missing-value counts
data.info()
print(data.isnull().sum())

# Drop rows that contain missing values
data_cleaned = data.dropna()

# Convert the 'Your age' column to integers and handle invalid values
data_cleaned['Your age'] = pd.to_numeric(data_cleaned['Your age'], errors='coerce')
data_cleaned = data_cleaned.dropna(subset=['Your age'])
data_cleaned = data_cleaned[data_cleaned['Your age'] >= 0]
data_cleaned['Your age'] = data_cleaned['Your age'].astype(int)
print(data_cleaned['Your age'].dtype)

# Check for and drop duplicate rows
duplicates_removed = data_cleaned.duplicated().sum()
data_cleaned = data_cleaned.drop_duplicates()
print(f"Removed {duplicates_removed} duplicate rows")

# Encode the 'How do you describe your current level of fitness?' column as integers
label_encoder = LabelEncoder()
data_cleaned['How do you describe your current level of fitness?'] = label_encoder.fit_transform(
    data_cleaned['How do you describe your current level of fitness?'])
print(data_cleaned['How do you describe your current level of fitness?'].unique())

# Strip whitespace from column names and show them
data.columns = data.columns.str.strip()
print(data.columns)

# Drop rows with a missing exercise frequency
data_cleaned = data.dropna(subset=['How often do you exercise?'])

# Distribution of exercise frequency
exercise_frequency_counts = data_cleaned['How often do you exercise?'].value_counts()

# Pie chart of exercise frequency
plt.figure(figsize=(10, 6))
exercise_frequency_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors)
plt.title('Distribution of Exercise Frequency')
plt.ylabel('')
plt.show()

# Fill remaining missing values with each column's mode
data_filled = data.apply(lambda x: x.fillna(x.mode()[0]))

# Split into training and test sets
train_data, test_data = train_test_split(data_filled, test_size=0.2, random_state=42)

# Save the cleaned data
cleaned_file_path = '2.1.5_cleaned_data.csv'
data_filled.to_csv(cleaned_file_path, index=False)
2.2.1 Intelligent credit scoring: Logistic regression model development and testing

(1) Code:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load the data
data = pd.read_csv('finance数据集.csv')
# Show the first five rows
print(data.head())

# Select the independent variables and the target variable
X = data.drop(['SeriousDlqin2yrs', 'Unnamed: 0'], axis=1)
y = data['SeriousDlqin2yrs']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Save the model
with open('2.2.1_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Predict and save the results
y_pred = model.predict(X_test)
pd.DataFrame(y_pred, columns=['预测结果']).to_csv('2.2.1_results.txt', index=False)

# Generate the test report
report = classification_report(y_test, y_pred, zero_division=1)
with open('2.2.1_report.txt', 'w') as file:
    file.write(report)

# Analyse the test results
accuracy = (y_test == y_pred).mean()
print(f"模型准确率:{accuracy:.2f}")

# Handle class imbalance with SMOTE oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Retrain the model on the resampled data
model.fit(X_resampled, y_resampled)

# Predict again
y_pred_resampled = model.predict(X_test)

# Save the new results
pd.DataFrame(y_pred_resampled, columns=['预测结果']).to_csv('2.2.1_results_xg.txt', index=False)

# Generate the new test report
report_resampled = classification_report(y_test, y_pred_resampled, zero_division=1)
with open('2.2.1_report_xg.txt', 'w') as file:
    file.write(report_resampled)

# Analyse the new test results
accuracy_resampled = (y_test == y_pred_resampled).mean()
print(f"重新采样后的模型准确率:{accuracy_resampled:.2f}")

(2) Logistic model (note: full marks are given as long as the first decimal place of each value is correct)

Model performance:

Class                         precision  recall  f1-score  support
0 (no serious delinquency)    0.95       0.99    0.97      26779
1 (serious delinquency)       0.57       0.14    0.22      1737

Error analysis:
0 (no serious delinquency): the model identifies normal customers accurately with very few false alarms, although a small number of risky customers are mislabelled as normal.
1 (serious delinquency): most high-risk customers are classified as normal; the model's ability to identify this class is poor, mainly because of the extreme class imbalance.

Improvement suggestions:
Use SMOTE oversampling to balance the class distribution.
Switch to a non-linear model such as XGBoost to improve recall (see the sketch below).
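The last suggestion names XGBoost, but the answer above only retrains the logistic regression after SMOTE. A hedged sketch of how that variant might look, assuming the xgboost package is installed and reusing X_resampled, y_resampled, X_test and y_test from the code above; the hyperparameter values are illustrative, not values from the source.

from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Sketch only: train the suggested XGBoost classifier on the SMOTE-resampled
# training data and report its performance on the untouched test set.
xgb_model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1,
                          eval_metric='logloss', random_state=42)
xgb_model.fit(X_resampled, y_resampled)
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb, zero_division=1))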