



M1のMacBook Air (2020)

ProductName:    macOS
ProductVersion: 12.4
BuildVersion:   21F79


conda create -n optuna-shap python=3.9 --file requirements-conda.txt



requirements-conda.txtにいろいろなパッケージを書いたのは、optunaのベータ版をpipからインストールする必要があることに起因する。optunaはpipからインストールせざるを得ないが、できる限りpipとcondaを混在させたくないため、optuna以外のパッケージはすべてconda installで入るように指定した結果である。このあと以下のコマンドで仮想環境をactivateし、pipでベータ版のoptuna (v3.0.0b1)をインストールした。

conda activate optuna-shap
pip install optuna==3.0.0b1



from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.base import clone
import optuna
from optuna.integration.shap import ShapleyImportanceEvaluator
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from urllib import request
import os
import pandas as pd




# Distribution page: https://datachemeng.com/pythonassignment/
url = 'https://datachemeng.com/wp-content/uploads/2017/07/logSdataset1290.csv'

# Keep a local copy of the CSV so the dataset is fetched over the network
# only on the first run.
dirpath_cache = os.path.abspath('./_cache')
# exist_ok=True makes this a no-op when the cache directory already exists.
os.makedirs(dirpath_cache, exist_ok=True)

fpath_csv_cache = os.path.join(dirpath_cache, os.path.basename(url))
if not os.path.isfile(fpath_csv_cache):
    # Download only when no cached file is present.
    with request.urlopen(url) as response:
        content = response.read().decode('utf-8-sig')
    with open(fpath_csv_cache, 'w', encoding='utf-8-sig') as f:
        f.write(content)

# The first CSV column becomes the row index of the DataFrame.
df_data = pd.read_csv(fpath_csv_cache, index_col=0)
logS MolWt HeavyAtomMolWt ExactMolWt NumValenceElectrons NumRadicalElectrons MaxPartialCharge MinPartialCharge MaxAbsPartialCharge MinAbsPartialCharge ... fr_sulfide fr_sulfonamd fr_sulfone fr_term_acetylene fr_tetrazole fr_thiazole fr_thiocyan fr_thiophene fr_unbrch_alkane fr_urea
CC(N)=O 1.58 59.068 54.028 59.037114 24 0 0.213790 -0.369921 0.369921 0.213790 ... 0 0 0 0 0 0 0 0 0 0
CNN 1.34 46.073 40.025 46.053098 20 0 -0.001725 -0.271722 0.271722 0.001725 ... 0 0 0 0 0 0 0 0 0 0
CC(=O)O 1.22 60.052 56.020 60.021129 24 0 0.299685 -0.481433 0.481433 0.299685 ... 0 0 0 0 0 0 0 0 0 0
C1CCNC1 1.15 71.123 62.051 71.073499 30 0 -0.004845 -0.316731 0.316731 0.004845 ... 0 0 0 0 0 0 0 0 0 0
NC(=O)NO 1.12 76.055 72.023 76.027277 30 0 0.335391 -0.349891 0.349891 0.335391 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 197 columns


# One seed reused for the model, the CV splitter, the sampler and the
# importance evaluator so that reruns are reproducible.
SEED = 334


# 'logS' is the regression target; every remaining column is a feature.
col_target = 'logS'
y = df_data[col_target]
X = df_data.drop(columns=[col_target])
MolWt HeavyAtomMolWt ExactMolWt NumValenceElectrons NumRadicalElectrons MaxPartialCharge MinPartialCharge MaxAbsPartialCharge MinAbsPartialCharge MaxEStateIndex ... fr_sulfide fr_sulfonamd fr_sulfone fr_term_acetylene fr_tetrazole fr_thiazole fr_thiocyan fr_thiophene fr_unbrch_alkane fr_urea
CC(N)=O 59.068 54.028 59.037114 24 0 0.213790 -0.369921 0.369921 0.213790 9.222222 ... 0 0 0 0 0 0 0 0 0 0
CNN 46.073 40.025 46.053098 20 0 -0.001725 -0.271722 0.271722 0.001725 4.597222 ... 0 0 0 0 0 0 0 0 0 0
CC(=O)O 60.052 56.020 60.021129 24 0 0.299685 -0.481433 0.481433 0.299685 9.000000 ... 0 0 0 0 0 0 0 0 0 0
C1CCNC1 71.123 62.051 71.073499 30 0 -0.004845 -0.316731 0.316731 0.004845 3.222222 ... 0 0 0 0 0 0 0 0 0 0
NC(=O)NO 76.055 72.023 76.027277 30 0 0.335391 -0.349891 0.349891 0.335391 9.229167 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 196 columns



# Base estimator; every trial clones it and overrides the tuned
# hyperparameters, so this instance itself is never fitted.
rf = RandomForestRegressor(random_state=SEED, n_jobs=-1)


def objective(trial: optuna.trial.Trial) -> float:
    """Return the 5-fold cross-validated MSE of a random forest for one trial.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.

    Returns:
        Mean of the per-fold test mean squared errors (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        # suggest_float replaces suggest_uniform, which is deprecated since
        # Optuna v3.0.0; the sampled range is unchanged.
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 1.0),
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_score = cross_validate(
        clone(rf).set_params(**params),
        X,
        y,
        cv=kf,
        scoring='neg_mean_squared_error',
        return_train_score=False,
        n_jobs=-1,
    )
    # The scorer yields the negated MSE, so flip the sign to report plain MSE.
    return -cv_score['test_score'].mean()


# Dry-run the objective once with hand-picked parameters before starting the
# real study. Some values (e.g. min_samples_split=2) lie outside the float
# ranges declared in objective(), so Optuna logs an out-of-range warning.
trial_test = optuna.trial.FixedTrial({
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
})
objective(trial_test)


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.FixedTrial.suggest_float` instead.
The value 2 of the parameter 'min_samples_split' is out of the range of the distribution FloatDistribution(high=1.0, log=False, low=0.1, step=None).
suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.FixedTrial.suggest_float` instead.




sampler = optuna.samplers.TPESampler(seed=SEED)
# objective() returns the cross-validated MSE (an error measure), so the
# study has to minimise it; 'maximize' would steer the search towards the
# worst-performing hyperparameters.
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials=50)
[I 2022-06-11 17:31:49,556] A new study created in memory with name: no-name-8ae21fd3-ed79-473d-b95f-81af69a35113
suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.
suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.
[I 2022-06-11 17:31:50,207] Trial 0 finished with value: 1.9372632750738048 and parameters: {'n_estimators': 24, 'max_depth': 1, 'min_samples_split': 0.21184083616914012, 'min_samples_leaf': 0.25349329561749007}. Best is trial 0 with value: 1.9372632750738048.
# Hyperparameter-importance chart from Optuna's default visualization
# backend, scored with the SHAP-based evaluator instead of the default one.
shap_evaluator = ShapleyImportanceEvaluator(seed=SEED)
optuna.visualization.plot_param_importances(study, evaluator=shap_evaluator)
ShapleyImportanceEvaluator is experimental (supported from v3.0.0). The interface can change in the future.


# Same importance chart, drawn with Optuna's matplotlib backend.
shap_evaluator_mpl = ShapleyImportanceEvaluator(seed=SEED)
optuna.visualization.matplotlib.plot_param_importances(
    study,
    evaluator=shap_evaluator_mpl,
)
/var/folders/81/n__nnfgd0zbf9m67d0jvmx_c0000gn/T/ipykernel_2553/425946331.py:1: ExperimentalWarning:

ShapleyImportanceEvaluator is experimental (supported from v3.0.0). The interface can change in the future.

/var/folders/81/n__nnfgd0zbf9m67d0jvmx_c0000gn/T/ipykernel_2553/425946331.py:1: ExperimentalWarning:

plot_param_importances is experimental (supported from v2.2.0). The interface can change in the future.

<AxesSubplot:title={'center':'Hyperparameter Importances'}, xlabel='Importance for Objective Value', ylabel='Hyperparameter'>








# `warnings` is not among the imports at the top of the file, so bring it
# into scope here before it is used.
import warnings

# Silence Optuna's ExperimentalWarning, which the experimental evaluator and
# plotting helpers otherwise emit on every call.
warnings.filterwarnings('ignore', category=optuna.exceptions.ExperimentalWarning)

# Compute the importances directly (instead of through a plotting helper) to
# get a mapping of hyperparameter name -> importance value.
evaluator = ShapleyImportanceEvaluator(seed=SEED)
importances = evaluator.evaluate(study)
OrderedDict([('min_samples_leaf', 0.27189964137458184),
             ('min_samples_split', 0.07344403282773004),
             ('max_depth', 0.024016461643093377),
             ('n_estimators', 0.003185782057350999)])




# Hand-made version of the importance chart, drawn with seaborn.
fig = plt.figure(dpi=144, facecolor='w')

# Reshape the name -> importance mapping into a one-row, wide-form frame
# (one column per hyperparameter) and draw it as horizontal bars.
df_importances = pd.DataFrame.from_dict(importances, orient='index').T
sns.barplot(data=df_importances, orient='h', color='#006699')

# Label the axes that seaborn just drew on.
ax: plt.Axes = plt.gca()
ax.set(
    xlabel='Importance for Objective Value',
    title='Hyperparameter Importances',
)







