ML/Kaggle Notes#
Notes on various practical ML/Kaggle operations.
To keep things from getting cluttered, these notes stick to pandas/numpy/sklearn as far as possible.
Other libraries that come up occasionally: sns (seaborn), mlxtend, fuzzywuzzy.
https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html
https://scikit-learn.org/stable/user_guide.html
1. Concepts#
1.1 DataFrame/Series#
import pandas as pd
import numpy as np
df = pd.DataFrame({'Yes':[50, 21], 'No':[131, 2]})
df = pd.Series([1, 2, 3, 4, 5])
# A Series can also carry a name, a custom index, and other metadata
df = pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')
# Column access on a DataFrame: bracket and attribute styles are equivalent
df['ggg']
df.ggg
df.iloc[2] # row #2
df.iloc[:, 2] # column #2
df.iloc[3:5, 2] # rows #3-#4 (iloc slices exclude the end), column #2
df.iloc[[4, 5, 9], 2] # rows #4, #5, #9, column #2
# iloc vs loc
# https://stackoverflow.com/questions/31593201/how-are-iloc-and-loc-different
# loc selects rows and columns by label
df.loc[[0,1,10,100], ['country', 'province', 'region_1', 'region_2']]
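The practical difference, shown on a toy frame (not the housing data): loc is label-based and its slices include the end label, while iloc is position-based and its slices exclude the end position.
toy = pd.DataFrame({'a': [10, 20, 30, 40]}, index = ['w', 'x', 'y', 'z'])
print(toy.loc['w':'y']) # label slice, end label included -> 3 rows
print(toy.iloc[0:2])    # position slice, end excluded -> 2 rows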
# Conditional selection (each comparison needs its own parentheses; & binds tighter than |)
print(df.loc[(df.MSZoning == 'RL') & (df.SaleCondition == 'Abnorml') | (df.SalePrice > 400000)])
print(df.loc[(df.SaleType.isin(['COD']))])
# Get the index of rows matching a condition
index = df.index[df.SaleType == 'New']
print(df.index[(df.SaleType == 'New') & (df.SalePrice > 400000)])
print(index)
print(df.loc[index])
# Modify data directly
df.SalePrice = 666
df.SalePrice = df.SalePrice * 10000000
df['SalePrice'] = df['SalePrice'].replace(208500, 666) # replace values; inplace=True on a column selection is unreliable (chained assignment)
# Quick summary of a column
print(df.SalePrice.describe())
print(df.SalePrice.mean())
print(df.SalePrice.median())
print(df['SaleType'].unique())
print(df.SalePrice.value_counts()) # count of each unique value
1.2 map/apply#
# Transform data, e.g. convert formats.
# Divide one column's values by 10
df.SalePrice = df.SalePrice.map(lambda p: p / 10)
# Modify only the rows selected by an index
index = df.index[df.SaleType == 'New']
df.loc[index, 'SalePrice'] = df.loc[index, 'SalePrice'].map(lambda p: p * 10)
# Same effect with apply
def deal_data(row):
    row.SalePrice *= 100
    return row
df = df.apply(deal_data, axis = 1)
# Sorting: DataFrame.sort_values
print(df.sort_values(by = 'SalePrice', ascending = False))
# Sort by multiple columns in order
print(df.sort_values(by = ['SalePrice', 'MSSubClass']))
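Rough division of labor between the three, on a toy Series/DataFrame: Series.map is element-wise, Series.apply is element-wise but accepts extra arguments, and DataFrame.apply runs per column (axis = 0) or per row (axis = 1).
s = pd.Series([1, 2, 3])
print(s.map(lambda v: v * 2))                   # element-wise
print(s.apply(lambda v, k: v * k, args = (3,))) # element-wise with extra args
toy = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
print(toy.apply(sum, axis = 0)) # one result per column
print(toy.apply(sum, axis = 1)) # one result per row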
1.3 Grouping#
# Group by a column's values, then compute per group.
# By default the group keys become the new index.
print(df.groupby('SaleType').SalePrice.sum())
print(df.groupby('SaleType').SalePrice.min())
print(df.groupby('SaleType').SalePrice.max())
# Group by one column
# apply with the identity function returns the original rows
with pd.option_context('display.max_rows', None):
    print(df.groupby(['SalePrice']).apply(lambda x: x))
# Group by two columns
print(df.groupby(['SaleType', 'SalePrice']).apply(lambda x: x))
# Inspect each group with a function
# Each call receives one group's rows as a DataFrame, possibly several rows.
def app(data):
    print(type(data))
    print(data)
    return data
print(df.groupby(['SaleType', 'SalePrice']).apply(app))
# Return the row with the largest value of some column within each group
def app(data):
    print(type(data))
    print(data)
    print(data.SalePrice)
    idxmax = pd.to_numeric(data['MSSubClass']).idxmax()
    print(f"idxmax {idxmax}")
    return df.loc[idxmax]
print(df.groupby(['SaleType', 'SalePrice']).apply(app))
# agg runs several computations per group and lists them side by side
# There are several ways to pass the functions (callables vs. name strings); see the named-aggregation sketch below
print(df.groupby(['SaleType', 'SalePrice']).MSSubClass.agg([len, 'min', 'max', 'mean']))
# Group by multiple columns, aggregate, then sort the result.
df.groupby(['SaleType', 'SalePrice']).MSSubClass.agg([len, 'min', 'max', 'mean']).sort_values(by = ['len', 'mean'])
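A tidier way to pass multiple functions to agg is named aggregation, where each output column is declared as keyword = (column, function); a sketch on the same housing DataFrame:
print(df.groupby('SaleType').agg(
    n = ('SalePrice', 'size'),
    price_min = ('SalePrice', 'min'),
    price_max = ('SalePrice', 'max'),
    price_mean = ('SalePrice', 'mean'),
))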
1.4 Basic data preprocessing#
# Datasets
# https://en.wikipedia.org/wiki/List_of_datasets_for_machine-learning_research
# https://archive.ics.uci.edu
# https://www.kaggle.com/datasets
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
#url = 'https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction/download?datasetVersionNumber=1'
# Read a CSV
# Directly from a URL; some sources need extra handling.
df = pd.read_csv(url)
print(df.head())
# https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction?select=heart.csv
# https://www.kaggle.com/competitions/home-data-for-ml-course/data?select=train.csv
# Can read a zip directly
df = pd.read_csv('heart.zip')
print(df.head())
# csv
df = pd.read_csv('train.csv')
print(df.head())
# Check dtypes
print(df.dtypes)
# Missing data
# https://pandas.pydata.org/docs/user_guide/missing_data.html
# Count nulls
# sum works per column by default
missing = df.isnull().sum()
print(missing)
# Per row
print(df.isnull().sum(axis = 1))
# Print all rows
with pd.option_context('display.max_rows', None):
    print(missing)
# Print a slice
print(missing[:10])
# sum again for the total
print(missing.sum())
# As a percentage of all cells
print(f"missing {missing.sum()/np.prod(df.shape):.2%}") # np.product was removed in NumPy 2.0
# Rows where a given column is null
print(df[pd.isnull(df['XXX'])])
# Drop rows or columns containing NA
df.dropna(inplace = True)
df.dropna(axis = 1, inplace = True)
# Fill NA
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
# Several fill strategies are available
df.fillna(0, inplace = True) # other strategies: bfill, ffill
df['Age'] = df['Age'].fillna(df['Age'].mean()) # mean; avoid inplace=True on a column selection
df['Age'] = df['Age'].fillna(df['Age'].mode()[0]) # most frequent value
# Discussion on filling with the mean
# https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns
# Fill with sklearn.impute.SimpleImputer
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# Fills NA with a statistic of the existing values; 'mean' and the like only work on numeric columns.
# fit_transform consumes a DataFrame and returns an ndarray.
# Convert back to a DataFrame; columns = df.columns keeps the original header.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent') # or 'mean', 'median', 'constant'
df = pd.DataFrame(imp.fit_transform(df), columns = df.columns)
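In a real train/validation split the imputer should be fit on the training data only and then reused, to avoid leaking validation statistics; a minimal sketch with hypothetical df_train/df_valid splits sharing the same columns:
imp = SimpleImputer(strategy = 'mean')
df_train = pd.DataFrame(imp.fit_transform(df_train), columns = df_train.columns) # fit + transform on train
df_valid = pd.DataFrame(imp.transform(df_valid), columns = df_valid.columns)     # transform only on validation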
# Scaling and Normalization
# https://www.kaggle.com/code/alexisbcook/scaling-and-normalization/tutorial
# Scaling
from sklearn.preprocessing import MinMaxScaler, minmax_scale
# Scales to 0-1 by default; minmax_scale can operate on selected columns directly
df[['MSSubClass', 'ggg']] = minmax_scale(df[['MSSubClass', 'ggg']])
# = only binds a reference; DataFrame.copy() defaults to a deep copy.
df2 = df
df2 = df.copy()
# Diff two DataFrames
print(df.compare(df2))
print(df.equals(df2))
# Precision differences across libraries
# from mlxtend.preprocessing import minmax_scaling
# mlxtend's minmax_scaling and sklearn's minmax_scale can return slightly different results.
pd.set_option("display.precision", 28)
# At the default display precision, df.compare output can look identical while df.equals is still False.
# Raising the display precision shows the values really do differ.
# Different libraries compute results that differ at the precision level.
# Normalization
# https://stackoverflow.com/questions/26414913/normalize-columns-of-a-dataframe
from sklearn.preprocessing import StandardScaler
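The StandardScaler import above is not exercised; a minimal sketch of z-score normalization on selected columns (same housing DataFrame assumed; in a real split, fit on training data only):
scaler = StandardScaler()
df[['MSSubClass', 'SalePrice']] = scaler.fit_transform(df[['MSSubClass', 'SalePrice']])
print(df[['MSSubClass', 'SalePrice']].describe()) # mean ~0, std ~1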
# Data conversion
# Parse strings into datetime, adding the result as a new column.
df['date_parsed'] = pd.to_datetime(df['date'], format="%m/%d/%y")
# String lengths of a column; sometimes useful for distinguishing entries.
data_lengths = df['LandSlope'].str.len()
print(data_lengths.value_counts())
# Cast to a type
df.SalePrice.astype(int)
# Rename index entries and columns with DataFrame.rename
df.rename(index = {0:'firstEntry', 1:'secondEntry'})
df.rename(columns = {'points': 'score'})
1.5 Model evaluation#
https://scikit-learn.org/stable/modules/model_evaluation.html
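The guide covers many metrics; a tiny sketch of mean_absolute_error, the one used in the house-price section below (hypothetical arrays):
from sklearn.metrics import mean_absolute_error
y_true = [3.0, 5.0, 2.5]
y_pred = [2.5, 5.0, 4.0]
print(mean_absolute_error(y_true, y_pred)) # (0.5 + 0.0 + 1.5) / 3 = 0.667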
1.6 Cross-validation#
https://scikit-learn.org/stable/modules/cross_validation.html
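A minimal sketch of 5-fold cross-validation with cross_val_score (X and y as in the house-price example below; sklearn returns a negated MAE, hence the minus sign):
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
scores = -cross_val_score(RandomForestRegressor(random_state = 0), X, y,
                          cv = 5, scoring = 'neg_mean_absolute_error')
print(scores, scores.mean())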
1.7 Mutual Information#
https://www.kaggle.com/code/ryanholbrook/mutual-information
https://scikit-learn.org/stable/modules/feature_selection.html
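A sketch of ranking numeric features by MI against the target (assuming a numeric, NA-free X and a SalePrice target y; categorical columns need encoding and the discrete_features flag first):
from sklearn.feature_selection import mutual_info_regression
mi = mutual_info_regression(X, y, random_state = 0)
mi = pd.Series(mi, index = X.columns).sort_values(ascending = False)
print(mi.head(10))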
2 kaggle#
2.1 House price prediction#
https://www.kaggle.com/learn/intro-to-machine-learning
# Official example
# Load the data
melbourne_data = pd.read_csv('melb_data.csv') # the Melbourne housing snapshot CSV
# Drop rows containing missing values
melbourne_data = melbourne_data.dropna(axis = 0)
# Pick the target
y = melbourne_data.Price
# Pick the features
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
# Decision tree
from sklearn.tree import DecisionTreeRegressor
melbourne_model = DecisionTreeRegressor(random_state = 1)
melbourne_model.fit(X, y)
# MAE, computed on the training data (optimistic)
from sklearn.metrics import mean_absolute_error
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)
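Because that MAE is measured on the data the model was trained on, it understates the true error; the course's next step holds out a validation split, roughly:
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
melbourne_model = DecisionTreeRegressor(random_state = 1)
melbourne_model.fit(train_X, train_y)
print(mean_absolute_error(val_y, melbourne_model.predict(val_X))) # validation MAE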
# Load the competition data
X_train_org = pd.read_csv('train.csv')
X_test_org = pd.read_csv('test.csv')
X_test_all = X_test_org
X_train_all = X_train_org
# Handle the target
X_train_all.dropna(axis = 0, subset = ['SalePrice'], inplace = True)
y_train_all = X_train_all['SalePrice']
X_train_all.drop(['SalePrice'], axis = 1, inplace = True)
# Pick some ordinal columns (quality/condition ratings and the like)
ord_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']
# Columns for one-hot encoding
onehot_cols = ['LandContour', 'Condition1', 'PavedDrive', 'Electrical', 'MasVnrType', 'Alley', 'MiscFeature', 'BldgType', 'GarageType', 'Heating', 'RoofMatl', 'LotConfig', 'Functional', 'SaleType', 'RoofStyle', 'LandSlope', 'HouseStyle', 'SaleCondition', 'Condition2', 'Foundation', 'LotShape', 'MSZoning']
# Drop the ordinal picks from the one-hot set
onehot_cols = list(set(onehot_cols) - set(ord_cols))
# Numeric columns
numerical_cols = [cname for cname in X_train_all.columns if X_train_all[cname].dtype in ['int64', 'float64']]
numerical_cols.remove('MoSold')
print(f"onehot_cols {onehot_cols}")
print(f"ord_cols {ord_cols}")
print(f"numerical_cols {numerical_cols}")
# All selected columns: one-hot / numeric / ordinal
my_cols = onehot_cols + numerical_cols + ord_cols
print(f"my_cols.shape {len(my_cols)}")
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, train_size = 0.8, random_state = 2)
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()
X_test = X_test_all[my_cols].copy()
print(f"X_train {X_train}")
print(f"X_valid {X_valid}")
print(f"X_test {X_test}")
# Imports for the preprocessing/model pipeline below
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
# Pipeline for numeric columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])
# Pipeline for one-hot columns
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])
# Pipeline for ordinal labels
ord_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ord', OrdinalEncoder( # explicit categories would need one list per column, e.g. [['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']] * len(ord_cols)
                           handle_unknown = 'use_encoded_value',
                           unknown_value = -1)) # or np.nan
])
# Assemble a ColumnTransformer: each pipeline paired with the columns it handles
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('onehot', onehot_transformer, onehot_cols),
('ord', ord_transformer, ord_cols)
])
# Create the model
model = RandomForestRegressor(n_estimators = 150, random_state = 662)
# Assemble the final pipeline
my_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('model', model)])
# Train
my_pipeline.fit(X_train, y_train)
# Predict
pred = my_pipeline.predict(X_test)
# Write the submission
output = pd.DataFrame({'Id': X_test_org['Id'], 'SalePrice': pred})
output.to_csv('out.csv', index=False)
Follow-up
Added Mutual Information analysis. Tinkering with the known features, the OrdinalEncoder, and so on for quite a while did not bring much improvement.
Engineering a feature such as OverallQual * GrLivArea, whose MI score reaches 0.757572, brought a huge improvement: the rank jumped from around 40,000 to around 4,000.
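A sketch of how such a feature can be scored (a fresh read of train.csv assumed, since the frames above had SalePrice dropped in place; QualArea is a hypothetical name for the engineered feature):
from sklearn.feature_selection import mutual_info_regression
train = pd.read_csv('train.csv')
train['QualArea'] = train['OverallQual'] * train['GrLivArea'] # engineered feature
cols = ['OverallQual', 'GrLivArea', 'QualArea']
mi = mutual_info_regression(train[cols], train['SalePrice'], random_state = 0)
print(pd.Series(mi, index = cols).sort_values(ascending = False))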
2.2 Titanic#
https://www.kaggle.com/competitions/titanic/overview
# Age has 100+ missing values; filling with the most common value beats dropping the rows by a good margin.
# Combining SibSp and Parch into a total did not seem to help.
# Cabin: rows with a value clearly show a higher survival rate, but engineering it did not seem to help.
# Keep Pclass as ordinal; do not one-hot it.
# An NN reaches a score of 0.7799, rank 3470/15286.
# Random forest: 0.72488.
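A minimal sketch of the preprocessing described above (hypothetical feature list; Sex mapped to numbers, Age filled with the mode, Pclass left ordinal):
from sklearn.ensemble import RandomForestClassifier
train = pd.read_csv('train.csv')
feats = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[feats].copy()
X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
X['Age'] = X['Age'].fillna(X['Age'].mode()[0]) # most common value, per the note above
model = RandomForestClassifier(n_estimators = 200, random_state = 0)
model.fit(X, train['Survived'])
print(model.score(X, train['Survived'])) # training accuracy, optimistic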