Thailand Machine Learning for Chemistry Competition (TMLCC)¶
Team: วัยรุ่นอเมริกา (it's a Thai name)
This is the main notebook, containing most of the work including feature engineering and model training/ensembling. Not all of the complex model building/training and data exploration is covered here, though. Please check out the other files in the repo, and the presentation slides linked in README.md, for those details: https://github.com/jomariya23156/MOFs_CO2_prediction_TMLCC
Note on feature engineering & techniques¶
- volume/g is WORKING
- n_atoms and atoms_mass_average WORKING (but both must be used together, not just one)
- atoms_vol WORKING
- weight / n_atoms WORKING
- GSA from Monte Carlo NOT WORKING
- surface_area / (void_vol * 100) NOT WORKING for prediction (but quite good as a ratio for filling surface_area)
- void_vol / volume/g is NOT WORKING
- Abs(CO2_working_capacity [mL/g]) NOT WORKING (which means the negative values in the label are valid)
- All cell_length and cell_angle NOT WORKING
- Sum_cell_length and Sum_cell_angle NOT WORKING
- void_fraction * volume [A^3] NOT WORKING
- void_fraction * weight [u] NOT WORKING
- surface_area / n_atoms NOT WORKING
- mol_total_mass NOT WORKING (mol_avg_mass is far better)
- pseudo labeling NOT WORKING
- classify void_fraction to porosity_level (from 1 to 5) NOT WORKING
- classify mol_total_mass to mol_size (from 1 to 5) NOT WORKING
- Preprocessing train and pretest data in different ways results in a worse score! (even just changing how 1 variable is filled) DON'T DO THIS
- After all this time, including functional_groups still doesn't work and leads to overfitting; functional_groups NOT WORKING
- MultiLabelBinarizer on organic_linker NOT WORKING
- hba1, hba2, hbd, nF of MOF NOT WORKING
- hba1, hba2, hbd, nF, linker_weight, linker_atoms NOT WORKING
- heat / n_atoms NOT WORKING
- CO2/N2_selectivity / n_atoms NOT WORKING
- charges / n_atoms NOT WORKING
- void_vol cm3 NOT WORKING
- density / n_atoms NOT WORKING
- atoms_vol / volume & volume / atoms_vol & void_vol / (atoms_vol / volume) NOT WORKING
Notes on feats from di_cnn¶
- All 11 feats work best for gb and lgbm
- Fewer feats give a slightly better result for TabNet
- NOT using feats from di_cnn at all is best for MLP (see the sketch below)
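A minimal sketch of how these per-model subsets could be built (illustrative only; it assumes the X feature frame and the di_cnn_* column names created later in this notebook):
# Illustrative sketch: per-model feature subsets based on the notes above.
# Assumes X is the feature DataFrame built later in this notebook.
di_cnn_cols = [c for c in X.columns if c.startswith('di_cnn_')]
X_tree = X                            # gb / lgbm: keep all 11 di_cnn feats
X_mlp = X.drop(columns=di_cnn_cols)   # MLP: best without any di_cnn feats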
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, RobustScaler, StandardScaler
df = pd.read_csv('train_extra_2.csv')
df
MOFname | volume [A^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | functional_groups | metal_linker | organic_linker1 | organic_linker2 | ... | _cell_length_c | _cell_angle_alpha | _cell_angle_beta | _cell_angle_gamma | n_atoms | mol_avg_mass | charges | mol_avg_radius | atoms_volume | atoms_area | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 875.240600 | 0.00 | 0.07899 | 0.0607 | COOH-OEt | 3 | 4 | 11 | ... | 9.890832 | 1.569125 | 1.592480 | 1.575368 | 75 | 11.669907 | 92 | 59.653333 | 85.488522 | 359.334171 |
1 | mof_unit_2 | 2769.503842 | 2211.697211 | 603.61 | 0.13794 | 0.1040 | F-OMe | 10 | 44 | 57 | ... | 18.960098 | 1.746437 | 1.602488 | 1.691961 | 194 | 11.400559 | 250 | 60.515464 | 318.194213 | 1020.568686 |
2 | mof_unit_3 | 1089.818728 | 773.687960 | 788.50 | 0.14874 | 0.1262 | OMe-COOH | 2 | 22 | 24 | ... | 10.631996 | 1.556872 | 1.569806 | 1.577559 | 82 | 9.435293 | 120 | 59.585366 | 92.531908 | 390.662171 |
3 | mof_unit_4 | 2205.198301 | 1304.638720 | 1441.53 | 0.21814 | 0.2220 | H-SO3H | 9 | 17 | 24 | ... | 19.274980 | 1.911789 | 1.574891 | 1.580099 | 112 | 11.648598 | 204 | 63.500000 | 185.375000 | 638.296686 |
4 | mof_unit_5 | 1137.800963 | 901.736120 | 0.00 | 0.07778 | 0.0591 | NHMe-OH | 2 | 1 | 22 | ... | 10.853274 | 1.565467 | 1.622999 | 1.595312 | 94 | 9.593000 | 90 | 58.425532 | 98.775910 | 428.741029 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68608 | mof_unit_68609 | 1188.302573 | 1001.700216 | 0.00 | 0.00000 | 0.0000 | Pr-F | 3 | 4 | 24 | ... | 10.193870 | 1.585497 | 1.609910 | 1.583947 | 119 | 8.417773 | 216 | 59.277311 | 124.277288 | 552.300571 |
68609 | mof_unit_68610 | 1506.660363 | 1493.296496 | 0.00 | 0.01108 | 0.0000 | SO3H | 10 | 42 | 46 | ... | 15.033794 | 1.661287 | 1.730445 | 1.700483 | 126 | 11.851548 | 126 | 62.460317 | 261.730396 | 736.477029 |
68610 | mof_unit_68611 | 2035.532738 | 1959.518320 | 0.00 | 0.00000 | 0.0000 | OPr | 4 | 14 | 22 | ... | 18.608120 | 1.574297 | 1.572863 | 1.034849 | 204 | 9.605572 | 366 | 61.578431 | 289.412179 | 1072.591771 |
68611 | mof_unit_68612 | 3985.426053 | 3638.677280 | 0.00 | 0.00000 | 0.0000 | OPr-Me | 4 | 4 | 15 | ... | 18.544746 | 1.578949 | 1.585477 | 1.569257 | 364 | 9.996454 | 652 | 62.054945 | 541.832410 | 1958.553143 |
68612 | mof_unit_68613 | 1591.009408 | 2071.219000 | 0.00 | 0.01609 | 0.0000 | I-OEt | 2 | 9 | 16 | ... | 13.706241 | 2.068063 | 1.280517 | 1.711999 | 116 | 17.855463 | 222 | 65.155172 | 176.607141 | 671.663771 |
68613 rows × 26 columns
df.columns
Index(['MOFname', 'volume [A^3]', 'weight [u]', 'surface_area [m^2/g]', 'void_fraction', 'void_volume [cm^3/g]', 'functional_groups', 'metal_linker', 'organic_linker1', 'organic_linker2', 'topology', 'CO2/N2_selectivity', 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]', 'CO2_working_capacity [mL/g]', '_cell_length_a', '_cell_length_b', '_cell_length_c', '_cell_angle_alpha', '_cell_angle_beta', '_cell_angle_gamma', 'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius', 'atoms_volume', 'atoms_area'], dtype='object')
# let's arrange and select only good features
## features for MLP and NN
# df = df[['MOFname', 'volume [A^3]', 'weight [u]', 'surface_area [m^2/g]',
# 'void_fraction', 'void_volume [cm^3/g]', 'functional_groups',
# 'metal_linker', 'organic_linker1', 'organic_linker2', 'topology',
# 'CO2/N2_selectivity', 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
# 'n_atoms', 'mol_avg_mass', 'charges', 'CO2_working_capacity [mL/g]']]
df = df[['MOFname', 'volume [A^3]', 'weight [u]', 'surface_area [m^2/g]',
'void_fraction', 'void_volume [cm^3/g]', 'functional_groups',
'metal_linker', 'organic_linker1', 'organic_linker2', 'topology',
'CO2/N2_selectivity', 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius', 'atoms_volume',
'atoms_area', 'CO2_working_capacity [mL/g]']]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 68613 entries, 0 to 68612 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MOFname 68613 non-null object 1 volume [A^3] 68613 non-null float64 2 weight [u] 68613 non-null float64 3 surface_area [m^2/g] 68613 non-null float64 4 void_fraction 68613 non-null float64 5 void_volume [cm^3/g] 68613 non-null float64 6 functional_groups 68290 non-null object 7 metal_linker 68613 non-null int64 8 organic_linker1 68613 non-null int64 9 organic_linker2 68613 non-null int64 10 topology 68613 non-null object 11 CO2/N2_selectivity 68613 non-null float64 12 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 66526 non-null float64 13 n_atoms 68613 non-null int64 14 mol_avg_mass 68613 non-null float64 15 charges 68613 non-null int64 16 mol_avg_radius 68613 non-null float64 17 atoms_volume 68613 non-null float64 18 atoms_area 68613 non-null float64 19 CO2_working_capacity [mL/g] 68613 non-null float64 dtypes: float64(12), int64(5), object(3) memory usage: 10.5+ MB
df.describe()
volume [A^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | CO2/N2_selectivity | heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] | n_atoms | mol_avg_mass | charges | mol_avg_radius | atoms_volume | atoms_area | CO2_working_capacity [mL/g] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 6.652600e+04 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 | 68613.000000 |
mean | 3447.363207 | 1656.761858 | 1666.766690 | 0.259164 | 0.376052 | 4.203271 | 11.919257 | 20.583592 | 28.599681 | inf | 140.665253 | 11.990179 | 220.989055 | 61.988885 | 193.150420 | 739.177366 | 120.002797 |
std | 4840.665782 | 1259.086320 | 1366.317223 | 0.164758 | 0.476452 | 3.144905 | 10.783136 | 10.100870 | 153.806887 | NaN | 101.953227 | 3.010884 | 178.118021 | 2.017855 | 141.562607 | 532.277329 | 89.573112 |
min | 606.576038 | 439.281220 | -1.000000 | -1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.612299e+00 | 26.000000 | 6.507769 | -344.000000 | 55.209302 | 42.584658 | 146.371657 | -44.285746 |
25% | 1556.075767 | 893.662700 | 511.930000 | 0.142580 | 0.119000 | 2.000000 | 4.000000 | 14.000000 | 12.818366 | 5.267536e+00 | 78.000000 | 10.153402 | 114.000000 | 60.666667 | 96.954142 | 399.807886 | 65.537205 |
50% | 2190.442847 | 1259.699253 | 1542.830000 | 0.242620 | 0.248400 | 3.000000 | 10.000000 | 20.000000 | 19.689890 | 5.899089e+00 | 110.000000 | 11.369379 | 174.000000 | 61.720430 | 141.888015 | 569.976000 | 98.552185 |
75% | 3605.836441 | 1897.163660 | 2517.960000 | 0.345120 | 0.443800 | 4.000000 | 16.000000 | 25.000000 | 32.954388 | 6.768365e+00 | 164.000000 | 12.979880 | 266.000000 | 62.956522 | 238.156101 | 871.466514 | 163.139540 |
max | 223964.854408 | 22595.928960 | 7083.530000 | 0.872060 | 6.610100 | 12.000000 | 59.000000 | 59.000000 | 29369.777780 | inf | 1776.000000 | 49.464101 | 4128.000000 | 80.238095 | 2471.014455 | 9893.392457 | 736.061636 |
Clean Data¶
Engineer new features¶
def check_all_problems(df):
if 'functional_groups' in df.columns:
print('Missing values in Functional_groups:', df['functional_groups'].isnull().sum())
print(f"Missing values in Heat adsorption: {df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].isnull().sum()}, Inf value in Heat Adsorption: {len(df[df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'] == np.inf])}")
print(f"0 values in void_volume: {len(df[df['void_volume [cm^3/g]']==0])}")
print(f"0 values in void_fraction: {len(df[df['void_fraction']==0])}, -1 value: {len(df[df['void_fraction']==-1])}")
print(f"0 values in surface_area: {len(df[df['surface_area [m^2/g]']==0])}, -1 value: {len(df[df['surface_area [m^2/g]']==-1])}")
check_all_problems(df)
Missing values in Functional_groups: 323
Missing values in Heat adsorption: 2087, Inf value in Heat Adsorption: 2
0 values in void_volume: 3130
0 values in void_fraction: 2874, -1 value: 56
0 values in surface_area: 14483, -1 value: 56
# From a lot of model fitting, the functional_groups column is barely important,
# so let's drop it
df = df.drop(['functional_groups'], axis=1)
df
MOFname | volume [A^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | topology | CO2/N2_selectivity | heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] | n_atoms | mol_avg_mass | charges | mol_avg_radius | atoms_volume | atoms_area | CO2_working_capacity [mL/g] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 875.240600 | 0.00 | 0.07899 | 0.0607 | 3 | 4 | 11 | pcu | 22.864166 | 6.786041 | 75 | 11.669907 | 92 | 59.653333 | 85.488522 | 359.334171 | 105.284502 |
1 | mof_unit_2 | 2769.503842 | 2211.697211 | 603.61 | 0.13794 | 0.1040 | 10 | 44 | 57 | etb | 33.616780 | 7.147286 | 194 | 11.400559 | 250 | 60.515464 | 318.194213 | 1020.568686 | 101.224774 |
2 | mof_unit_3 | 1089.818728 | 773.687960 | 788.50 | 0.14874 | 0.1262 | 2 | 22 | 24 | pcu | 19.263726 | 6.347967 | 82 | 9.435293 | 120 | 59.585366 | 92.531908 | 390.662171 | 118.987011 |
3 | mof_unit_4 | 2205.198301 | 1304.638720 | 1441.53 | 0.21814 | 0.2220 | 9 | 17 | 24 | sra | 25.701377 | 6.190085 | 112 | 11.648598 | 204 | 63.500000 | 185.375000 | 638.296686 | 187.626004 |
4 | mof_unit_5 | 1137.800963 | 901.736120 | 0.00 | 0.07778 | 0.0591 | 2 | 1 | 22 | pcu | 30.001838 | 6.478063 | 94 | 9.593000 | 90 | 58.425532 | 98.775910 | 428.741029 | 79.210001 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68608 | mof_unit_68609 | 1188.302573 | 1001.700216 | 0.00 | 0.00000 | 0.0000 | 3 | 4 | 24 | pcu | 24.131770 | NaN | 119 | 8.417773 | 216 | 59.277311 | 124.277288 | 552.300571 | -12.943652 |
68609 | mof_unit_68610 | 1506.660363 | 1493.296496 | 0.00 | 0.01108 | 0.0000 | 10 | 42 | 46 | etb | 6.071818 | NaN | 126 | 11.851548 | 126 | 62.460317 | 261.730396 | 736.477029 | -12.985582 |
68610 | mof_unit_68611 | 2035.532738 | 1959.518320 | 0.00 | 0.00000 | 0.0000 | 4 | 14 | 22 | acs | 9.876134 | NaN | 204 | 9.605572 | 366 | 61.578431 | 289.412179 | 1072.591771 | -13.187635 |
68611 | mof_unit_68612 | 3985.426053 | 3638.677280 | 0.00 | 0.00000 | 0.0000 | 4 | 4 | 15 | acs | 5.285051 | inf | 364 | 9.996454 | 652 | 62.054945 | 541.832410 | 1958.553143 | 15.672698 |
68612 | mof_unit_68613 | 1591.009408 | 2071.219000 | 0.00 | 0.01609 | 0.0000 | 2 | 9 | 16 | pcu | 2.621272 | inf | 116 | 17.855463 | 222 | 65.155172 | 176.607141 | 671.663771 | 3.144708 |
68613 rows × 19 columns
# We need to fit the binarizers before dropping rows with invalid void_volume,
# since we risk dropping some labels from these columns and the encoders would raise an error
# if the test set contains labels they have never seen before
## ['functional_groups', 'metal_linker', 'organic_linker1', 'organic_linker2', 'topology']
# fg_lb = LabelBinarizer()
# fg_lb.fit(df['functional_groups'])
metal_lb = LabelBinarizer()
metal_lb.fit(df['metal_linker'])
# organic_linker1 and organic_linker2 contain exactly the same set of labels, so fitting just one of them is OK
organic_lb = LabelBinarizer()
organic_lb.fit(df['organic_linker1'])
topology_lb = LabelBinarizer()
topology_lb.fit(df['topology'])
LabelBinarizer()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 68613 entries, 0 to 68612 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MOFname 68613 non-null object 1 volume [A^3] 68613 non-null float64 2 weight [u] 68613 non-null float64 3 surface_area [m^2/g] 68613 non-null float64 4 void_fraction 68613 non-null float64 5 void_volume [cm^3/g] 68613 non-null float64 6 metal_linker 68613 non-null int64 7 organic_linker1 68613 non-null int64 8 organic_linker2 68613 non-null int64 9 topology 68613 non-null object 10 CO2/N2_selectivity 68613 non-null float64 11 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 66526 non-null float64 12 n_atoms 68613 non-null int64 13 mol_avg_mass 68613 non-null float64 14 charges 68613 non-null int64 15 mol_avg_radius 68613 non-null float64 16 atoms_volume 68613 non-null float64 17 atoms_area 68613 non-null float64 18 CO2_working_capacity [mL/g] 68613 non-null float64 dtypes: float64(12), int64(5), object(2) memory usage: 9.9+ MB
Drop some serious outliers¶
Dropping these outliers leads to a better result
# all these cutoff values come from the EDA
df = df.drop(df[df['volume [A^3]'] > 100000].index, axis=0)
df = df.drop(df[df['CO2/N2_selectivity'] > 10000].index, axis=0)
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 2
0 values in void_volume: 3126
0 values in void_fraction: 2871, -1 value: 56
0 values in surface_area: 14479, -1 value: 56
df
MOFname | volume [A^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | topology | CO2/N2_selectivity | heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] | n_atoms | mol_avg_mass | charges | mol_avg_radius | atoms_volume | atoms_area | CO2_working_capacity [mL/g] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 875.240600 | 0.00 | 0.07899 | 0.0607 | 3 | 4 | 11 | pcu | 22.864166 | 6.786041 | 75 | 11.669907 | 92 | 59.653333 | 85.488522 | 359.334171 | 105.284502 |
1 | mof_unit_2 | 2769.503842 | 2211.697211 | 603.61 | 0.13794 | 0.1040 | 10 | 44 | 57 | etb | 33.616780 | 7.147286 | 194 | 11.400559 | 250 | 60.515464 | 318.194213 | 1020.568686 | 101.224774 |
2 | mof_unit_3 | 1089.818728 | 773.687960 | 788.50 | 0.14874 | 0.1262 | 2 | 22 | 24 | pcu | 19.263726 | 6.347967 | 82 | 9.435293 | 120 | 59.585366 | 92.531908 | 390.662171 | 118.987011 |
3 | mof_unit_4 | 2205.198301 | 1304.638720 | 1441.53 | 0.21814 | 0.2220 | 9 | 17 | 24 | sra | 25.701377 | 6.190085 | 112 | 11.648598 | 204 | 63.500000 | 185.375000 | 638.296686 | 187.626004 |
4 | mof_unit_5 | 1137.800963 | 901.736120 | 0.00 | 0.07778 | 0.0591 | 2 | 1 | 22 | pcu | 30.001838 | 6.478063 | 94 | 9.593000 | 90 | 58.425532 | 98.775910 | 428.741029 | 79.210001 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68608 | mof_unit_68609 | 1188.302573 | 1001.700216 | 0.00 | 0.00000 | 0.0000 | 3 | 4 | 24 | pcu | 24.131770 | NaN | 119 | 8.417773 | 216 | 59.277311 | 124.277288 | 552.300571 | -12.943652 |
68609 | mof_unit_68610 | 1506.660363 | 1493.296496 | 0.00 | 0.01108 | 0.0000 | 10 | 42 | 46 | etb | 6.071818 | NaN | 126 | 11.851548 | 126 | 62.460317 | 261.730396 | 736.477029 | -12.985582 |
68610 | mof_unit_68611 | 2035.532738 | 1959.518320 | 0.00 | 0.00000 | 0.0000 | 4 | 14 | 22 | acs | 9.876134 | NaN | 204 | 9.605572 | 366 | 61.578431 | 289.412179 | 1072.591771 | -13.187635 |
68611 | mof_unit_68612 | 3985.426053 | 3638.677280 | 0.00 | 0.00000 | 0.0000 | 4 | 4 | 15 | acs | 5.285051 | inf | 364 | 9.996454 | 652 | 62.054945 | 541.832410 | 1958.553143 | 15.672698 |
68612 | mof_unit_68613 | 1591.009408 | 2071.219000 | 0.00 | 0.01609 | 0.0000 | 2 | 9 | 16 | pcu | 2.621272 | inf | 116 | 17.855463 | 222 | 65.155172 | 176.607141 | 671.663771 | 3.144708 |
68602 rows × 19 columns
Add new feats from DeepInsight CNN¶
di_cnn_feats = pd.read_csv('train_feats_from_di_cnn.csv')
di_cnn_feats
di_cnn_1 | di_cnn_2 | di_cnn_4 | di_cnn_5 | di_cnn_6 | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21.167830 | 30.934912 | 0.0 | 21.876890 | 30.938911 | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 |
1 | 23.965372 | 35.462387 | 0.0 | 25.400830 | 35.742040 | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 |
2 | 25.874134 | 39.247130 | 0.0 | 27.537067 | 39.353320 | 0.000000 | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 |
3 | 28.681208 | 44.552822 | 0.0 | 31.782713 | 44.529415 | 0.000000 | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 |
4 | 21.825914 | 31.975190 | 0.0 | 22.386007 | 31.988407 | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 |
68598 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 |
68599 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 |
68600 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 |
68601 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 |
68602 rows × 11 columns
comb_cnn_feats = pd.read_csv('train_feats_from_comb_cnn.csv')
comb_cnn_feats
comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 |
1 | 0.000000 | 0.000000 | 0.000000 | 69.381874 |
2 | 0.000000 | 0.000000 | 7.915159 | 82.243230 |
3 | 0.000000 | 0.000000 | 0.000000 | 117.579170 |
4 | 0.000000 | 0.000000 | 0.000000 | 61.207540 |
... | ... | ... | ... | ... |
68597 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68598 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68599 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
68601 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
68602 rows × 4 columns
df = df.reset_index(drop=True)
df = pd.concat([df, di_cnn_feats, comb_cnn_feats], axis=1)
df
MOFname | volume [A^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | topology | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 875.240600 | 0.00 | 0.07899 | 0.0607 | 3 | 4 | 11 | pcu | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 |
1 | mof_unit_2 | 2769.503842 | 2211.697211 | 603.61 | 0.13794 | 0.1040 | 10 | 44 | 57 | etb | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 |
2 | mof_unit_3 | 1089.818728 | 773.687960 | 788.50 | 0.14874 | 0.1262 | 2 | 22 | 24 | pcu | ... | 0.000000 | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 |
3 | mof_unit_4 | 2205.198301 | 1304.638720 | 1441.53 | 0.21814 | 0.2220 | 9 | 17 | 24 | sra | ... | 0.000000 | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 |
4 | mof_unit_5 | 1137.800963 | 901.736120 | 0.00 | 0.07778 | 0.0591 | 2 | 1 | 22 | pcu | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | mof_unit_68609 | 1188.302573 | 1001.700216 | 0.00 | 0.00000 | 0.0000 | 3 | 4 | 24 | pcu | ... | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68598 | mof_unit_68610 | 1506.660363 | 1493.296496 | 0.00 | 0.01108 | 0.0000 | 10 | 42 | 46 | etb | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68599 | mof_unit_68611 | 2035.532738 | 1959.518320 | 0.00 | 0.00000 | 0.0000 | 4 | 14 | 22 | acs | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | mof_unit_68612 | 3985.426053 | 3638.677280 | 0.00 | 0.00000 | 0.0000 | 4 | 4 | 15 | acs | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
68601 | mof_unit_68613 | 1591.009408 | 2071.219000 | 0.00 | 0.01609 | 0.0000 | 2 | 9 | 16 | pcu | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
68602 rows × 34 columns
Density¶
df.insert(
loc=2,
column="density [g/cm^3]",
value=(df["weight [u]"] / df["volume [A^3]"]) * 1.66054,
)
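The 1.66054 factor converts u/Å³ to g/cm³ (1 u ≈ 1.66054e-24 g and 1 Å³ = 1e-24 cm³); a quick sanity check of the constant, as a minimal sketch:
# Sanity check of the conversion factor used in the density formula above:
# 1 u ≈ 1.66054e-24 g and 1 A^3 = 1e-24 cm^3, so 1 u/A^3 ≈ 1.66054 g/cm^3.
u_in_g = 1.66054e-24        # grams per atomic mass unit
angstrom3_in_cm3 = 1e-24    # cm^3 per cubic angstrom
print(u_in_g / angstrom3_in_cm3)   # ≈ 1.66054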
df
MOFname | volume [A^3] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 1.301526 | 875.240600 | 0.00 | 0.07899 | 0.0607 | 3 | 4 | 11 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 |
1 | mof_unit_2 | 2769.503842 | 1.326090 | 2211.697211 | 603.61 | 0.13794 | 0.1040 | 10 | 44 | 57 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 |
2 | mof_unit_3 | 1089.818728 | 1.178856 | 773.687960 | 788.50 | 0.14874 | 0.1262 | 2 | 22 | 24 | ... | 0.000000 | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 |
3 | mof_unit_4 | 2205.198301 | 0.982408 | 1304.638720 | 1441.53 | 0.21814 | 0.2220 | 9 | 17 | 24 | ... | 0.000000 | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 |
4 | mof_unit_5 | 1137.800963 | 1.316020 | 901.736120 | 0.00 | 0.07778 | 0.0591 | 2 | 1 | 22 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | mof_unit_68609 | 1188.302573 | 1.399781 | 1001.700216 | 0.00 | 0.00000 | 0.0000 | 3 | 4 | 24 | ... | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68598 | mof_unit_68610 | 1506.660363 | 1.645811 | 1493.296496 | 0.00 | 0.01108 | 0.0000 | 10 | 42 | 46 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68599 | mof_unit_68611 | 2035.532738 | 1.598529 | 1959.518320 | 0.00 | 0.00000 | 0.0000 | 4 | 14 | 22 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | mof_unit_68612 | 3985.426053 | 1.516066 | 3638.677280 | 0.00 | 0.00000 | 0.0000 | 4 | 4 | 15 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
68601 | mof_unit_68613 | 1591.009408 | 2.161736 | 2071.219000 | 0.00 | 0.01609 | 0.0000 | 2 | 9 | 16 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
68602 rows × 35 columns
Fill the ~200 rows where void_volume is 0 and void_fraction is not 0 or -1 with the formula " void_vol = void_fraction / density "¶
fill_void_vol_df = df[(df['void_volume [cm^3/g]']==0) & ((df['void_fraction']!=0) & (df['void_fraction']!=-1))]
fill_void_vol_df
MOFname | volume [A^3] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
615 | mof_unit_616 | 2986.257706 | 1.291033 | 2321.749040 | 0.0 | 0.00286 | 0.0 | 9 | 18 | 29 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 28.976114 | 0.000000 | 0.000000 | 4.749188 | 0.000000 | 119.355064 |
5772 | mof_unit_5773 | 2208.884643 | 1.701815 | 2263.789760 | 0.0 | 0.00815 | 0.0 | 9 | 16 | 18 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 28.084759 | 0.000000 | 0.000000 | 5.553674 | 0.000000 | 85.663025 |
6631 | mof_unit_6632 | 1595.944227 | 1.502962 | 1444.496040 | 0.0 | 0.01300 | 0.0 | 9 | 22 | 24 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 27.241444 | 0.000000 | 0.000000 | 3.601681 | 0.000000 | 70.222820 |
8227 | mof_unit_8228 | 1686.566488 | 1.321990 | 1342.710140 | 0.0 | 0.00789 | 0.0 | 9 | 11 | 2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 27.770300 | 0.000000 | 0.000000 | 2.410116 | 0.000000 | 78.291700 |
9625 | mof_unit_9626 | 3951.837174 | 1.621170 | 3858.143280 | 0.0 | 0.01559 | 0.0 | 4 | 1 | 24 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 26.016468 | 1.369474 | 0.000000 | 1.356497 | 0.000000 | 56.455738 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68578 | mof_unit_68590 | 2981.283155 | 1.464359 | 2629.066240 | 0.0 | 0.00486 | 0.0 | 9 | 18 | 29 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.337637 | 9.405468 | 6.199437 | 11.978576 |
68582 | mof_unit_68594 | 1524.505449 | 1.835401 | 1685.041936 | 0.0 | 0.00699 | 0.0 | 10 | 46 | 48 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.903412 | 8.181130 | 5.331585 | 8.300731 |
68584 | mof_unit_68596 | 3760.489888 | 1.476275 | 3343.200672 | 0.0 | 0.01221 | 0.0 | 10 | 42 | 42 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.951341 | 7.939940 | 5.150652 | 7.821176 |
68598 | mof_unit_68610 | 1506.660363 | 1.645811 | 1493.296496 | 0.0 | 0.01108 | 0.0 | 10 | 42 | 46 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68601 | mof_unit_68613 | 1591.009408 | 2.161736 | 2071.219000 | 0.0 | 0.01609 | 0.0 | 2 | 9 | 16 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
199 rows × 35 columns
df.loc[((df['void_volume [cm^3/g]']==0) & ((df['void_fraction']!=0) & (df['void_fraction']!=-1))), 'void_volume [cm^3/g]'] = fill_void_vol_df['void_fraction'] / fill_void_vol_df['density [g/cm^3]']
# verify
df[(df['void_volume [cm^3/g]']==0) & ((df['void_fraction']!=0) & (df['void_fraction']!=-1))]
MOFname | volume [A^3] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 |
---|
0 rows × 35 columns
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 2
0 values in void_volume: 2927
0 values in void_fraction: 2871, -1 value: 56
0 values in surface_area: 14479, -1 value: 56
Important note on feature errors:¶
From here on, in all rows where void_fraction is invalid (0 or -1), void_volume is also 0.
df.insert(
loc=2,
column="volume [cm^3/g]",
value=df['volume [A^3]'] / (df['weight [u]'] * 1.66054),
)
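By construction this volume [cm^3/g] column is the reciprocal of density [g/cm^3]; a quick illustrative check:
# volume [cm^3/g] = volume / (weight * 1.66054) = 1 / density, so the two columns are reciprocals
print(np.allclose(df['volume [cm^3/g]'], 1 / df['density [g/cm^3]']))   # expected: True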
df
MOFname | volume [A^3] | volume [cm^3/g] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 0.00 | 0.07899 | 0.060700 | 3 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 |
1 | mof_unit_2 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.61 | 0.13794 | 0.104000 | 10 | 44 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 |
2 | mof_unit_3 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.50 | 0.14874 | 0.126200 | 2 | 22 | ... | 0.000000 | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 |
3 | mof_unit_4 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.53 | 0.21814 | 0.222000 | 9 | 17 | ... | 0.000000 | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 |
4 | mof_unit_5 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 0.00 | 0.07778 | 0.059100 | 2 | 1 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | mof_unit_68609 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | 0.00 | 0.00000 | 0.000000 | 3 | 4 | ... | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68598 | mof_unit_68610 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 0.00 | 0.01108 | 0.006732 | 10 | 42 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68599 | mof_unit_68611 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | 0.00 | 0.00000 | 0.000000 | 4 | 14 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | mof_unit_68612 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | 0.00 | 0.00000 | 0.000000 | 4 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
68601 | mof_unit_68613 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 0.00 | 0.01609 | 0.007443 | 2 | 9 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
68602 rows × 36 columns
Fill void_vol¶
Phase 2: Now q765 is better than q80, q70, q74, q75, q76, q763, q764, q7645, q7649, q76497, q76504, q76505, q7651, q7655, q766, q768, q77
Note: q765 makes MLP better, but for gb it is a little worse than q75.
# the q80 variable here should really be named q765; we forgot to rename it
voidvol_q80 = df[df['void_volume [cm^3/g]']!=0]['void_volume [cm^3/g]'].quantile(0.765)
voidvol_q80
0.4733999999999999
df.loc[df['void_volume [cm^3/g]']==0, 'void_volume [cm^3/g]'] = voidvol_q80
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 2
0 values in void_volume: 0
0 values in void_fraction: 2871, -1 value: 56
0 values in surface_area: 14479, -1 value: 56
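Since the notes above stress preprocessing train and pretest identically, the same train-derived quantile value should be reused when filling the pretest set; a minimal sketch (pretest_df is a hypothetical DataFrame loaded elsewhere):
# Reuse the SAME train-derived quantile when filling any other split;
# recomputing the quantile on pretest would break the "preprocess identically" rule.
def fill_zero_void_volume(frame, fill_value):
    frame = frame.copy()
    frame.loc[frame['void_volume [cm^3/g]'] == 0, 'void_volume [cm^3/g]'] = fill_value
    return frame

# pretest_df = fill_zero_void_volume(pretest_df, voidvol_q80)   # hypothetical pretest_df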
Fill void_fraction with formula " void_fraction = density * void_vol "¶
temp_df = df[df['void_fraction'].isin([0, -1])]
temp_df
MOFname | volume [A^3] | volume [cm^3/g] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
37 | mof_unit_38 | 810.133031 | 0.705430 | 1.417576 | 691.597320 | 0.0 | 0.0 | 0.4734 | 2 | 10 | ... | 0.000000 | 1.861506 | 0.0 | 0.0 | 31.085001 | 0.0 | 0.000000 | 6.205471 | 0.000000 | 105.866270 |
53 | mof_unit_54 | 1136.093302 | 0.717497 | 1.393734 | 953.552560 | 0.0 | 0.0 | 0.4734 | 2 | 9 | ... | 0.000000 | 22.974613 | 0.0 | 0.0 | 30.284027 | 0.0 | 0.000000 | 6.287744 | 0.000000 | 110.978970 |
62 | mof_unit_63 | 1105.832336 | 0.818345 | 1.221978 | 813.773143 | 0.0 | 0.0 | 0.4734 | 2 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 29.013565 | 0.0 | 0.000000 | 3.850498 | 0.000000 | 91.242500 |
152 | mof_unit_153 | 1683.048333 | 0.783010 | 1.277123 | 1294.434240 | 0.0 | 0.0 | 0.4734 | 2 | 14 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279400 | 0.0 | 0.000000 | 0.596128 | 0.000000 | 64.816280 |
243 | mof_unit_244 | 1840.691522 | 0.789265 | 1.267001 | 1404.457740 | 0.0 | 0.0 | 0.4734 | 9 | 16 | ... | 0.000000 | 44.439384 | 0.0 | 0.0 | 33.793285 | 0.0 | 0.000000 | 4.907324 | 0.000000 | 167.302950 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68595 | mof_unit_68607 | 949.067112 | 0.758226 | 1.318868 | 753.787520 | 0.0 | 0.0 | 0.4734 | 2 | 10 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.941383 | 7.906218 | 5.176683 | 7.782713 |
68596 | mof_unit_68608 | 1202.182553 | 0.694431 | 1.440028 | 1042.538240 | 0.0 | 0.0 | 0.4734 | 3 | 12 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.927152 | 7.960196 | 5.177389 | 7.929484 |
68597 | mof_unit_68609 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | 0.0 | 0.0 | 0.4734 | 3 | 4 | ... | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68599 | mof_unit_68611 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | 0.0 | 0.0 | 0.4734 | 4 | 14 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | mof_unit_68612 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | 0.0 | 0.0 | 0.4734 | 4 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
2927 rows × 36 columns
df.loc[df['void_fraction'].isin([0, -1]), 'void_fraction'] = temp_df['density [g/cm^3]'] * temp_df['void_volume [cm^3/g]']
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 2
0 values in void_volume: 0
0 values in void_fraction: 0, -1 value: 0
0 values in surface_area: 14479, -1 value: 56
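The fill formula mirrors a relation that already holds (approximately) on rows whose void_fraction was valid from the start, which is what justifies it; a quick illustrative check of the residuals:
# How well does void_fraction ≈ density * void_volume hold across the data?
# (exact on the filled rows; typically close on the originally valid ones)
abs_err = (df['void_fraction'] - df['density [g/cm^3]'] * df['void_volume [cm^3/g]']).abs()
print(abs_err.describe())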
Impute surface_area¶
df_imp = df.copy()
df_imp.loc[df_imp['surface_area [m^2/g]']==0, 'surface_area [m^2/g]'] = np.nan
df_imp.loc[df_imp['surface_area [m^2/g]']==-1, 'surface_area [m^2/g]'] = np.nan
df_imp
MOFname | volume [A^3] | volume [cm^3/g] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | NaN | 0.078990 | 0.060700 | 3 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 |
1 | mof_unit_2 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.61 | 0.137940 | 0.104000 | 10 | 44 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 |
2 | mof_unit_3 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.50 | 0.148740 | 0.126200 | 2 | 22 | ... | 0.000000 | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 |
3 | mof_unit_4 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.53 | 0.218140 | 0.222000 | 9 | 17 | ... | 0.000000 | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 |
4 | mof_unit_5 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | NaN | 0.077780 | 0.059100 | 2 | 1 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | mof_unit_68609 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | NaN | 0.662656 | 0.473400 | 3 | 4 | ... | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68598 | mof_unit_68610 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | NaN | 0.011080 | 0.006732 | 10 | 42 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68599 | mof_unit_68611 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | NaN | 0.756744 | 0.473400 | 4 | 14 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | mof_unit_68612 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | NaN | 0.717706 | 0.473400 | 4 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
68601 | mof_unit_68613 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | NaN | 0.016090 | 0.007443 | 2 | 9 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
68602 rows × 36 columns
df_imp.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 68602 entries, 0 to 68601 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MOFname 68602 non-null object 1 volume [A^3] 68602 non-null float64 2 volume [cm^3/g] 68602 non-null float64 3 density [g/cm^3] 68602 non-null float64 4 weight [u] 68602 non-null float64 5 surface_area [m^2/g] 54067 non-null float64 6 void_fraction 68602 non-null float64 7 void_volume [cm^3/g] 68602 non-null float64 8 metal_linker 68602 non-null int64 9 organic_linker1 68602 non-null int64 10 organic_linker2 68602 non-null int64 11 topology 68602 non-null object 12 CO2/N2_selectivity 68602 non-null float64 13 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 66519 non-null float64 14 n_atoms 68602 non-null int64 15 mol_avg_mass 68602 non-null float64 16 charges 68602 non-null int64 17 mol_avg_radius 68602 non-null float64 18 atoms_volume 68602 non-null float64 19 atoms_area 68602 non-null float64 20 CO2_working_capacity [mL/g] 68602 non-null float64 21 di_cnn_1 68602 non-null float64 22 di_cnn_2 68602 non-null float64 23 di_cnn_4 68602 non-null float64 24 di_cnn_5 68602 non-null float64 25 di_cnn_6 68602 non-null float64 26 di_cnn_8 68602 non-null float64 27 di_cnn_10 68602 non-null float64 28 di_cnn_12 68602 non-null float64 29 di_cnn_13 68602 non-null float64 30 di_cnn_14 68602 non-null float64 31 di_cnn_15 68602 non-null float64 32 comb_cnn_0 68602 non-null float64 33 comb_cnn_1 68602 non-null float64 34 comb_cnn_2 68602 non-null float64 35 comb_cnn_3 68602 non-null float64 dtypes: float64(29), int64(5), object(2) memory usage: 18.8+ MB
df_cat = df_imp[['MOFname', 'topology']]
df_imp = df_imp[['volume [cm^3/g]', 'weight [u]', 'surface_area [m^2/g]', 'void_volume [cm^3/g]',
'CO2/N2_selectivity', 'n_atoms', 'charges']]
df_imp
volume [cm^3/g] | weight [u] | surface_area [m^2/g] | void_volume [cm^3/g] | CO2/N2_selectivity | n_atoms | charges | |
---|---|---|---|---|---|---|---|
0 | 0.768329 | 875.240600 | NaN | 0.060700 | 22.864166 | 75 | 92 |
1 | 0.754097 | 2211.697211 | 603.61 | 0.104000 | 33.616780 | 194 | 250 |
2 | 0.848280 | 773.687960 | 788.50 | 0.126200 | 19.263726 | 82 | 120 |
3 | 1.017907 | 1304.638720 | 1441.53 | 0.222000 | 25.701377 | 112 | 204 |
4 | 0.759867 | 901.736120 | NaN | 0.059100 | 30.001838 | 94 | 90 |
... | ... | ... | ... | ... | ... | ... | ... |
68597 | 0.714398 | 1001.700216 | NaN | 0.473400 | 24.131770 | 119 | 216 |
68598 | 0.607603 | 1493.296496 | NaN | 0.006732 | 6.071818 | 126 | 126 |
68599 | 0.625575 | 1959.518320 | NaN | 0.473400 | 9.876134 | 204 | 366 |
68600 | 0.659602 | 3638.677280 | NaN | 0.473400 | 5.285051 | 364 | 652 |
68601 | 0.462591 | 2071.219000 | NaN | 0.007443 | 2.621272 | 116 | 222 |
68602 rows × 7 columns
surface_mean = df_imp['surface_area [m^2/g]'].mean()
surface_std = df_imp['surface_area [m^2/g]'].std()
# features to be scaled before using in imputation
feats_to_scale = ['volume [cm^3/g]', 'weight [u]', 'surface_area [m^2/g]', 'CO2/N2_selectivity',
'n_atoms', 'charges']
imp_scalers = {}
for feat in feats_to_scale:
print(feat)
scaler = RobustScaler()
scaler.fit(df_imp[[feat]])
scaled_feat = scaler.transform(df_imp[[feat]])
df_imp[feat] = scaled_feat
imp_scalers[feat] = scaler
volume [cm^3/g]
weight [u]
surface_area [m^2/g]
CO2/N2_selectivity
n_atoms
charges
c:\users\admin\anaconda3\envs\chemistry-ml\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy (this warning is repeated once per scaled feature)
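The SettingWithCopyWarning comes from df_imp being created by column-slicing above; it is harmless here, but the loop can be written warning-free by taking an explicit copy and assigning through .loc. A sketch of the same loop in that style (an alternative to, not an addition to, the loop above):
# Same scaling loop, written to avoid the SettingWithCopyWarning:
# work on an explicit copy and assign through .loc.
df_imp = df_imp.copy()
imp_scalers = {}
for feat in feats_to_scale:
    scaler = RobustScaler()
    df_imp.loc[:, feat] = scaler.fit_transform(df_imp[[feat]]).ravel()
    imp_scalers[feat] = scaler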
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 2
0 values in void_volume: 0
0 values in void_fraction: 0, -1 value: 0
0 values in surface_area: 14479, -1 value: 56
from impyute.imputation.cs import mice
# run the MICE imputation on the scaled feature matrix
imputed_surface=mice(df_imp.values)
imputed_surface.shape
(68602, 7)
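impyute's mice() is what was used here; if it is unavailable, scikit-learn's IterativeImputer gives a roughly equivalent MICE-style imputation. A hedged alternative sketch (not the run used above):
# Alternative MICE-style imputation with scikit-learn (not the run used above).
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (enables the estimator)
from sklearn.impute import IterativeImputer

alt_imputed = IterativeImputer(random_state=0).fit_transform(df_imp.values)  # same (n_rows, 7) shape as mice()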
col_list = df_imp.columns.tolist()
df_impute = pd.DataFrame(imputed_surface, columns = col_list)
df_impute
volume [cm^3/g] | weight [u] | surface_area [m^2/g] | void_volume [cm^3/g] | CO2/N2_selectivity | n_atoms | charges | |
---|---|---|---|---|---|---|---|
0 | -0.584699 | -0.383085 | -0.481798 | 0.060700 | 0.157631 | -0.406977 | -0.539474 |
1 | -0.615547 | 0.949117 | -0.846039 | 0.104000 | 0.691708 | 0.976744 | 0.500000 |
2 | -0.411402 | -0.484315 | -0.727178 | 0.126200 | -0.021201 | -0.325581 | -0.355263 |
3 | -0.043730 | 0.044946 | -0.307363 | 0.222000 | 0.298554 | 0.023256 | 0.197368 |
4 | -0.603040 | -0.356674 | -0.632685 | 0.059100 | 0.512156 | -0.186047 | -0.552632 |
... | ... | ... | ... | ... | ... | ... | ... |
68597 | -0.701596 | -0.257028 | -1.432939 | 0.473400 | 0.220592 | 0.104651 | 0.276316 |
68598 | -0.933076 | 0.233003 | -0.643068 | 0.006732 | -0.676437 | 0.186047 | -0.315789 |
68599 | -0.894122 | 0.697741 | -1.536209 | 0.473400 | -0.487479 | 1.093023 | 1.263158 |
68600 | -0.820367 | 2.371556 | -1.472261 | 0.473400 | -0.715515 | 2.953488 | 3.144737 |
68601 | -1.247394 | 0.809086 | -0.809562 | 0.007443 | -0.847824 | 0.069767 | 0.315789 |
68602 rows × 7 columns
# transform the values back to the original scale before feeding them back into the main data
df_impute['surface_area [m^2/g]'] = imp_scalers['surface_area [m^2/g]'].inverse_transform(df_impute[['surface_area [m^2/g]']])
df_impute
volume [cm^3/g] | weight [u] | surface_area [m^2/g] | void_volume [cm^3/g] | CO2/N2_selectivity | n_atoms | charges | |
---|---|---|---|---|---|---|---|
0 | -0.584699 | -0.383085 | 1170.192841 | 0.060700 | 0.157631 | -0.406977 | -0.539474 |
1 | -0.615547 | 0.949117 | 603.610000 | 0.104000 | 0.691708 | 0.976744 | 0.500000 |
2 | -0.411402 | -0.484315 | 788.500000 | 0.126200 | -0.021201 | -0.325581 | -0.355263 |
3 | -0.043730 | 0.044946 | 1441.530000 | 0.222000 | 0.298554 | 0.023256 | 0.197368 |
4 | -0.603040 | -0.356674 | 935.486251 | 0.059100 | 0.512156 | -0.186047 | -0.552632 |
... | ... | ... | ... | ... | ... | ... | ... |
68597 | -0.701596 | -0.257028 | -309.324898 | 0.473400 | 0.220592 | 0.104651 | 0.276316 |
68598 | -0.933076 | 0.233003 | 919.334794 | 0.006732 | -0.676437 | 0.186047 | -0.315789 |
68599 | -0.894122 | 0.697741 | -469.964067 | 0.473400 | -0.487479 | 1.093023 | 1.263158 |
68600 | -0.820367 | 2.371556 | -370.490957 | 0.473400 | -0.715515 | 2.953488 | 3.144737 |
68601 | -1.247394 | 0.809086 | 660.350715 | 0.007443 | -0.847824 | 0.069767 | 0.315789 |
68602 rows × 7 columns
df = df.reset_index(drop=True)
df_impute = df_impute.reset_index(drop=True)
df['surface_area [m^2/g]'] = df_impute['surface_area [m^2/g]']
df
MOFname | volume [A^3] | volume [cm^3/g] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | ... | di_cnn_8 | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | mof_unit_1 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 1170.192841 | 0.078990 | 0.060700 | 3 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 |
1 | mof_unit_2 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.610000 | 0.137940 | 0.104000 | 10 | 44 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 |
2 | mof_unit_3 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.500000 | 0.148740 | 0.126200 | 2 | 22 | ... | 0.000000 | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 |
3 | mof_unit_4 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.530000 | 0.218140 | 0.222000 | 9 | 17 | ... | 0.000000 | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 |
4 | mof_unit_5 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 935.486251 | 0.077780 | 0.059100 | 2 | 1 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | mof_unit_68609 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | -309.324898 | 0.662656 | 0.473400 | 3 | 4 | ... | 2.536157 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 |
68598 | mof_unit_68610 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 919.334794 | 0.011080 | 0.006732 | 10 | 42 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 |
68599 | mof_unit_68611 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | -469.964067 | 0.756744 | 0.473400 | 4 | 14 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 |
68600 | mof_unit_68612 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | -370.490957 | 0.717706 | 0.473400 | 4 | 4 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 |
68601 | mof_unit_68613 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 660.350715 | 0.016090 | 0.007443 | 2 | 9 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 |
68602 rows × 36 columns
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 2
0 values in void_volume: 0
0 values in void_fraction: 0, -1 value: 0
0 values in surface_area: 0, -1 value: 0
Fill inf values of heat adsorption with the MIN of the valid values¶
heat_min = df[(df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'] != np.inf) & (~df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].isnull())]['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].min()
heat_min
1.612299
df.loc[df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'] == np.inf, 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'] = heat_min
check_all_problems(df)
Missing values in Heat adsorption: 2083, Inf value in Heat Adsorption: 0
0 values in void_volume: 0
0 values in void_fraction: 0, -1 value: 0
0 values in surface_area: 0, -1 value: 0
Let's try filling heat with the min¶
From testing, filling with the min is better than filling with the mean, 0, or q0.005, or dropping the rows
heat_min = df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].min()
df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'] = df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].fillna(heat_min)
check_all_problems(df)
Missing values in Heat adsorption: 0, Inf value in Heat Adsorption: 0
0 values in void_volume: 0
0 values in void_fraction: 0, -1 value: 0
0 values in surface_area: 0, -1 value: 0
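Every constant derived from the training data so far (voidvol_q80, heat_min, the imputation scalers) has to be reused verbatim on the pretest set. A minimal sketch of persisting them with the joblib imported at the top of the notebook (the file name is illustrative):
# Persist train-derived preprocessing constants so the pretest pipeline
# can reuse exactly the same values (file name is illustrative).
joblib.dump(
    {'voidvol_q765': voidvol_q80, 'heat_min': heat_min, 'imp_scalers': imp_scalers},
    'preprocess_constants.joblib',
)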
specific heat¶
df['specific_heat'] = (df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]']*4.1868*1000)/(df['weight [u]']*65)
df['specific_heat']
0        0.499411
1        0.208154
2        0.528491
3        0.305615
4        0.462737
           ...
68597    0.103676
68598    0.069545
68599    0.052999
68600    0.028541
68601    0.050140
Name: specific_heat, Length: 68602, dtype: float64
Prepare X, y for training¶
# drop the MOF name column and the target
X = df.drop(['MOFname','CO2_working_capacity [mL/g]'], axis = 1)
y = df['CO2_working_capacity [mL/g]']
X
volume [A^3] | volume [cm^3/g] | density [g/cm^3] | weight [u] | surface_area [m^2/g] | void_fraction | void_volume [cm^3/g] | metal_linker | organic_linker1 | organic_linker2 | ... | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | specific_heat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 1170.192841 | 0.078990 | 0.060700 | 3 | 4 | 11 | ... | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 | 0.499411 |
1 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.610000 | 0.137940 | 0.104000 | 10 | 44 | 57 | ... | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 | 0.208154 |
2 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.500000 | 0.148740 | 0.126200 | 2 | 22 | 24 | ... | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 | 0.528491 |
3 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.530000 | 0.218140 | 0.222000 | 9 | 17 | 24 | ... | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 | 0.305615 |
4 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 935.486251 | 0.077780 | 0.059100 | 2 | 1 | 22 | ... | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 | 0.462737 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | -309.324898 | 0.662656 | 0.473400 | 3 | 4 | 24 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 | 0.103676 |
68598 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 919.334794 | 0.011080 | 0.006732 | 10 | 42 | 46 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 | 0.069545 |
68599 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | -469.964067 | 0.756744 | 0.473400 | 4 | 14 | 22 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 | 0.052999 |
68600 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | -370.490957 | 0.717706 | 0.473400 | 4 | 4 | 15 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 | 0.028541 |
68601 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 660.350715 | 0.016090 | 0.007443 | 2 | 9 | 16 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 | 0.050140 |
68602 rows × 35 columns
X = X.rename(columns={'volume [A^3]':'volume', 'volume [cm^3/g]':'volume/g', 'density [g/cm^3]':'density',
'weight [u]':'weight', 'surface_area [m^2/g]':'surface_area', 'void_volume [cm^3/g]':'void_volume',
'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]':'heat_adsorption'
})
X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | metal_linker | organic_linker1 | organic_linker2 | ... | di_cnn_10 | di_cnn_12 | di_cnn_13 | di_cnn_14 | di_cnn_15 | comb_cnn_0 | comb_cnn_1 | comb_cnn_2 | comb_cnn_3 | specific_heat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 1170.192841 | 0.078990 | 0.060700 | 3 | 4 | 11 | ... | 0.000000 | 0.0 | 0.0 | 24.025457 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 58.904250 | 0.499411 |
1 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.610000 | 0.137940 | 0.104000 | 10 | 44 | 57 | ... | 0.000000 | 0.0 | 0.0 | 26.279602 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 69.381874 | 0.208154 |
2 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.500000 | 0.148740 | 0.126200 | 2 | 22 | 24 | ... | 5.338663 | 0.0 | 0.0 | 26.913395 | 0.0 | 0.000000 | 0.000000 | 7.915159 | 82.243230 | 0.528491 |
3 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.530000 | 0.218140 | 0.222000 | 9 | 17 | 24 | ... | 20.764702 | 0.0 | 0.0 | 29.767157 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 117.579170 | 0.305615 |
4 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 935.486251 | 0.077780 | 0.059100 | 2 | 1 | 22 | ... | 0.000000 | 0.0 | 0.0 | 24.455626 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 61.207540 | 0.462737 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | -309.324898 | 0.662656 | 0.473400 | 3 | 4 | 24 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.716291 | 7.768239 | 5.032241 | 8.482945 | 0.103676 |
68598 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 919.334794 | 0.011080 | 0.006732 | 10 | 42 | 46 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.974198 | 8.142911 | 5.304706 | 7.986442 | 0.069545 |
68599 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | -469.964067 | 0.756744 | 0.473400 | 4 | 14 | 22 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.778554 | 7.931119 | 5.137025 | 8.463214 | 0.052999 |
68600 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | -370.490957 | 0.717706 | 0.473400 | 4 | 4 | 15 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.799744 | 7.906666 | 5.131299 | 8.344973 | 0.028541 |
68601 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 660.350715 | 0.016090 | 0.007443 | 2 | 9 | 16 | ... | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 1.005248 | 6.179784 | 3.833471 | 5.429558 | 0.050140 |
68602 rows × 35 columns
columns to one-hot encode¶
['functional_groups', 'metal_linker', 'organic_linker1', 'organic_linker2', 'topology']
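For reference, the fitted binarizers used below (fg_lb, metal_lb, organic_lb, topology_lb) were created earlier in the notebook. A minimal sketch of that fitting step, assuming each binarizer is fit on the corresponding raw training column (the actual fit earlier may differ slightly):
from sklearn.preprocessing import LabelBinarizer
# hypothetical re-creation of the binarizers fit earlier in the notebook
fg_lb = LabelBinarizer().fit(df['functional_groups'])
metal_lb = LabelBinarizer().fit(df['metal_linker'])
organic_lb = LabelBinarizer().fit(df['organic_linker1'])  # same class set as organic_linker2 (verified below)
topology_lb = LabelBinarizer().fit(df['topology'])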
- One-hot functional_groups
# total_fg = len(fg_lb.classes_)
## Commented out because the binarizer was already fit earlier in the notebook
# fg_lb = LabelBinarizer()
# fg_lb.fit(X['functional_groups'])
# encoded_fg = fg_lb.transform(X['functional_groups'])
# encoded_fg.shape
# drop the original one
# X = X.drop(['functional_groups'], axis = 1)
# fill one-hot encoded functional_groups to X
# for i in range(total_fg):
# X[f'functional_groups_{i}'] = encoded_fg[:, i]
# X
- One-hot metal_linker
total_metal = len(metal_lb.classes_)
total_metal
7
encoded_metal = metal_lb.transform(X['metal_linker'])
encoded_metal.shape
(68602, 7)
# drop the original one
X = X.drop(['metal_linker'], axis = 1)
# fill one-hot encoded metal_linker to X
for i in range(total_metal):
X[f'metal_linker_{i}'] = encoded_metal[:, i]
X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | organic_linker1 | organic_linker2 | topology | ... | comb_cnn_2 | comb_cnn_3 | specific_heat | metal_linker_0 | metal_linker_1 | metal_linker_2 | metal_linker_3 | metal_linker_4 | metal_linker_5 | metal_linker_6 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 1170.192841 | 0.078990 | 0.060700 | 4 | 11 | pcu | ... | 0.000000 | 58.904250 | 0.499411 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.610000 | 0.137940 | 0.104000 | 44 | 57 | etb | ... | 0.000000 | 69.381874 | 0.208154 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.500000 | 0.148740 | 0.126200 | 22 | 24 | pcu | ... | 7.915159 | 82.243230 | 0.528491 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.530000 | 0.218140 | 0.222000 | 17 | 24 | sra | ... | 0.000000 | 117.579170 | 0.305615 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 935.486251 | 0.077780 | 0.059100 | 1 | 22 | pcu | ... | 0.000000 | 61.207540 | 0.462737 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | -309.324898 | 0.662656 | 0.473400 | 4 | 24 | pcu | ... | 5.032241 | 8.482945 | 0.103676 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
68598 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 919.334794 | 0.011080 | 0.006732 | 42 | 46 | etb | ... | 5.304706 | 7.986442 | 0.069545 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
68599 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | -469.964067 | 0.756744 | 0.473400 | 14 | 22 | acs | ... | 5.137025 | 8.463214 | 0.052999 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
68600 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | -370.490957 | 0.717706 | 0.473400 | 4 | 15 | acs | ... | 5.131299 | 8.344973 | 0.028541 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
68601 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 660.350715 | 0.016090 | 0.007443 | 9 | 16 | pcu | ... | 3.833471 | 5.429558 | 0.050140 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68602 rows × 41 columns
- One-hot organic_linker1 and organic_linker2
np.sort(df['organic_linker1'].unique()) == np.sort(df['organic_linker2'].unique())
array([ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
Since organic_linker1 and organic_linker2 contain exactly the same set of values, we can fit a single LabelBinarizer and reuse it for both columns.
total_organic = len(organic_lb.classes_)
total_organic
57
encoded_organic1 = organic_lb.transform(X['organic_linker1'])
encoded_organic2 = organic_lb.transform(X['organic_linker2'])
encoded_organic1.shape, encoded_organic2.shape
((68602, 57), (68602, 57))
# drop the original one
X = X.drop(['organic_linker1', 'organic_linker2'], axis = 1)
# fill one-hot encoded version to X
# linker 1
for i in range(total_organic):
X[f'organic_linker1_{i}'] = encoded_organic1[:, i]
# linker 2
for i in range(total_organic):
X[f'organic_linker2_{i}'] = encoded_organic2[:, i]
X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | topology | CO2/N2_selectivity | heat_adsorption | ... | organic_linker2_47 | organic_linker2_48 | organic_linker2_49 | organic_linker2_50 | organic_linker2_51 | organic_linker2_52 | organic_linker2_53 | organic_linker2_54 | organic_linker2_55 | organic_linker2_56 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 1170.192841 | 0.078990 | 0.060700 | pcu | 22.864166 | 6.786041 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.610000 | 0.137940 | 0.104000 | etb | 33.616780 | 7.147286 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.500000 | 0.148740 | 0.126200 | pcu | 19.263726 | 6.347967 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.530000 | 0.218140 | 0.222000 | sra | 25.701377 | 6.190085 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 935.486251 | 0.077780 | 0.059100 | pcu | 30.001838 | 6.478063 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | -309.324898 | 0.662656 | 0.473400 | pcu | 24.131770 | 1.612299 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68598 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 919.334794 | 0.011080 | 0.006732 | etb | 6.071818 | 1.612299 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68599 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | -469.964067 | 0.756744 | 0.473400 | acs | 9.876134 | 1.612299 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68600 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | -370.490957 | 0.717706 | 0.473400 | acs | 5.285051 | 1.612299 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68601 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 660.350715 | 0.016090 | 0.007443 | pcu | 2.621272 | 1.612299 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68602 rows × 153 columns
- One-hot topology
total_topology = len(topology_lb.classes_)
total_topology
11
encoded_topology = topology_lb.transform(X['topology'])
encoded_topology.shape
(68602, 11)
# drop the original one
X = X.drop(['topology'], axis = 1)
# fill one-hot encoded topology into X
for i in range(total_topology):
X[f'topology_{i}'] = encoded_topology[:, i]
X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | CO2/N2_selectivity | heat_adsorption | n_atoms | ... | topology_1 | topology_2 | topology_3 | topology_4 | topology_5 | topology_6 | topology_7 | topology_8 | topology_9 | topology_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1116.667429 | 0.768329 | 1.301526 | 875.240600 | 1170.192841 | 0.078990 | 0.060700 | 22.864166 | 6.786041 | 75 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 2769.503842 | 0.754097 | 1.326090 | 2211.697211 | 603.610000 | 0.137940 | 0.104000 | 33.616780 | 7.147286 | 194 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1089.818728 | 0.848280 | 1.178856 | 773.687960 | 788.500000 | 0.148740 | 0.126200 | 19.263726 | 6.347967 | 82 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 2205.198301 | 1.017907 | 0.982408 | 1304.638720 | 1441.530000 | 0.218140 | 0.222000 | 25.701377 | 6.190085 | 112 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 1137.800963 | 0.759867 | 1.316020 | 901.736120 | 935.486251 | 0.077780 | 0.059100 | 30.001838 | 6.478063 | 94 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | 1188.302573 | 0.714398 | 1.399781 | 1001.700216 | -309.324898 | 0.662656 | 0.473400 | 24.131770 | 1.612299 | 119 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68598 | 1506.660363 | 0.607603 | 1.645811 | 1493.296496 | 919.334794 | 0.011080 | 0.006732 | 6.071818 | 1.612299 | 126 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68599 | 2035.532738 | 0.625575 | 1.598529 | 1959.518320 | -469.964067 | 0.756744 | 0.473400 | 9.876134 | 1.612299 | 204 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68600 | 3985.426053 | 0.659602 | 1.516066 | 3638.677280 | -370.490957 | 0.717706 | 0.473400 | 5.285051 | 1.612299 | 364 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68601 | 1591.009408 | 0.462591 | 2.161736 | 2071.219000 | 660.350715 | 0.016090 | 0.007443 | 2.621272 | 1.612299 | 116 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68602 rows × 163 columns
# X.to_csv('train_extra_2_no_scaling.csv', index=False)
Feature scaling¶
X.columns[:20]
Index(['volume', 'volume/g', 'density', 'weight', 'surface_area', 'void_fraction', 'void_volume', 'CO2/N2_selectivity', 'heat_adsorption', 'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius', 'atoms_volume', 'atoms_area', 'di_cnn_1', 'di_cnn_2', 'di_cnn_4', 'di_cnn_5', 'di_cnn_6'], dtype='object')
feats_to_scale = ['volume', 'volume/g', 'density', 'weight', 'surface_area',
'void_fraction', 'void_volume', 'CO2/N2_selectivity', 'heat_adsorption',
'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius', 'atoms_volume', 'atoms_area','di_cnn_1',
'di_cnn_2', 'di_cnn_4', 'di_cnn_5', 'di_cnn_6', 'di_cnn_8', 'di_cnn_10',
'di_cnn_12', 'di_cnn_13', 'di_cnn_14', 'di_cnn_15', 'comb_cnn_0',
'comb_cnn_1', 'comb_cnn_2', 'comb_cnn_3', 'specific_heat']
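RobustScaler centers each feature on its median and scales by the interquartile range (75th minus 25th percentile), so outliers do not dominate the scale. A small sanity check on a toy column (hypothetical values):
import numpy as np
from sklearn.preprocessing import RobustScaler
toy = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # one feature with an outlier
rs = RobustScaler().fit(toy)
manual = (toy - np.median(toy)) / (np.percentile(toy, 75) - np.percentile(toy, 25))
print(np.allclose(rs.transform(toy), manual))  # True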
rb_scaler = RobustScaler()
rb_scaler.fit(X[feats_to_scale])
RobustScaler()
scaled_feats = rb_scaler.transform(X[feats_to_scale])
scaled_feats.shape
(68602, 31)
for i, feat in enumerate(feats_to_scale):
print(feat)
X[feat] = scaled_feats[:, i]
volume volume/g density weight surface_area void_fraction void_volume CO2/N2_selectivity heat_adsorption n_atoms mol_avg_mass charges mol_avg_radius atoms_volume atoms_area di_cnn_1 di_cnn_2 di_cnn_4 di_cnn_5 di_cnn_6 di_cnn_8 di_cnn_10 di_cnn_12 di_cnn_13 di_cnn_14 di_cnn_15 comb_cnn_0 comb_cnn_1 comb_cnn_2 comb_cnn_3 specific_heat
## Using one scaler per feature gave the same results as a single scaler for all features
# kept for reference: one RobustScaler fit per feature
# scalers = {}
# for feat in feats_to_scale:
# print(feat)
# scaler = RobustScaler()
# scaler.fit(X[[feat]])
# scaled_feat = scaler.transform(X[[feat]])
# X[feat] = scaled_feat
# scalers[feat] = scaler
clean_X = X.copy()
clean_X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | CO2/N2_selectivity | heat_adsorption | n_atoms | ... | topology_1 | topology_2 | topology_3 | topology_4 | topology_5 | topology_6 | topology_7 | topology_8 | topology_9 | topology_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.524386 | -0.584699 | 0.798882 | -0.383085 | -0.257582 | -0.872930 | -0.642857 | 0.157631 | 0.609280 | -0.406977 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 0.282926 | -0.615547 | 0.856904 | 0.949117 | -0.604123 | -0.588382 | -0.513449 | 0.691708 | 0.845745 | 0.976744 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | -0.537500 | -0.411402 | 0.509126 | -0.484315 | -0.491038 | -0.536250 | -0.447101 | -0.021201 | 0.322523 | -0.325581 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0.007296 | -0.043730 | 0.045099 | 0.044946 | -0.091623 | -0.201260 | -0.160789 | 0.298554 | 0.219176 | 0.023256 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | -0.514063 | -0.603040 | 0.833117 | -0.356674 | -0.401136 | -0.878771 | -0.647639 | 0.512156 | 0.407682 | -0.186047 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | -0.489396 | -0.701596 | 1.030967 | -0.257028 | -1.162504 | 1.944400 | 0.590556 | 0.220592 | -2.777374 | 0.104651 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68598 | -0.333898 | -0.933076 | 1.612110 | 0.233003 | -0.411015 | -1.200729 | -0.804148 | -0.676437 | -2.777374 | 0.186047 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68599 | -0.075575 | -0.894122 | 1.500426 | 0.697741 | -1.260756 | 2.398555 | 0.590556 | -0.487479 | -2.777374 | 1.093023 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68600 | 0.876831 | -0.820367 | 1.305642 | 2.371556 | -1.199915 | 2.210121 | 0.590556 | -0.715515 | -2.777374 | 2.953488 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68601 | -0.292698 | -1.247394 | 2.830765 | 0.809086 | -0.569418 | -1.176546 | -0.802023 | -0.847824 | -2.777374 | 0.069767 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68602 rows × 163 columns
clean_X.columns[:15]
Index(['volume', 'volume/g', 'density', 'weight', 'surface_area', 'void_fraction', 'void_volume', 'CO2/N2_selectivity', 'heat_adsorption', 'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius', 'atoms_volume', 'atoms_area'], dtype='object')
X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | CO2/N2_selectivity | heat_adsorption | n_atoms | ... | topology_1 | topology_2 | topology_3 | topology_4 | topology_5 | topology_6 | topology_7 | topology_8 | topology_9 | topology_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.524386 | -0.584699 | 0.798882 | -0.383085 | -0.257582 | -0.872930 | -0.642857 | 0.157631 | 0.609280 | -0.406977 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 0.282926 | -0.615547 | 0.856904 | 0.949117 | -0.604123 | -0.588382 | -0.513449 | 0.691708 | 0.845745 | 0.976744 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | -0.537500 | -0.411402 | 0.509126 | -0.484315 | -0.491038 | -0.536250 | -0.447101 | -0.021201 | 0.322523 | -0.325581 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0.007296 | -0.043730 | 0.045099 | 0.044946 | -0.091623 | -0.201260 | -0.160789 | 0.298554 | 0.219176 | 0.023256 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | -0.514063 | -0.603040 | 0.833117 | -0.356674 | -0.401136 | -0.878771 | -0.647639 | 0.512156 | 0.407682 | -0.186047 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
68597 | -0.489396 | -0.701596 | 1.030967 | -0.257028 | -1.162504 | 1.944400 | 0.590556 | 0.220592 | -2.777374 | 0.104651 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68598 | -0.333898 | -0.933076 | 1.612110 | 0.233003 | -0.411015 | -1.200729 | -0.804148 | -0.676437 | -2.777374 | 0.186047 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68599 | -0.075575 | -0.894122 | 1.500426 | 0.697741 | -1.260756 | 2.398555 | 0.590556 | -0.487479 | -2.777374 | 1.093023 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68600 | 0.876831 | -0.820367 | 1.305642 | 2.371556 | -1.199915 | 2.210121 | 0.590556 | -0.715515 | -2.777374 | 2.953488 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
68601 | -0.292698 | -1.247394 | 2.830765 | 0.809086 | -0.569418 | -1.176546 | -0.802023 | -0.847824 | -2.777374 | 0.069767 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
68602 rows × 163 columns
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
Load params from Optuna¶
import pickle
# gb_best_params_am = pickle.load(open('pickle/gb_best_params_am.pickle', 'rb'))
# gb_best_params_wine = pickle.load(open('pickle/gb_best_params_wine.pickle', 'rb'))
# mlp_best_params = pickle.load(open('pickle/mlp_best_params.pickle', 'rb'))
# lgb_best_params = pickle.load(open('pickle/lgb_best_params.pickle', 'rb'))
# tab_best_params = pickle.load(open('pickle/tabnet_best_params.pickle', 'rb'))
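The commented-out loads above refer to Optuna studies run elsewhere; the tuned dictionaries below were obtained that way. A minimal sketch of how such a study could be set up, assuming a simple train/validation split of clean_X / y and a hypothetical GradientBoostingRegressor objective:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

X_tr, X_val, y_tr, y_val = train_test_split(clean_X, y, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 25),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 60),
    }
    model = GradientBoostingRegressor(**params)
    model.fit(X_tr, y_tr)
    return mean_absolute_error(y_val, model.predict(X_val))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
# pickle.dump(study.best_params, open('pickle/gb_best_params.pickle', 'wb'))  # hypothetical path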
Gradient Boosting¶
gb_best_params = {'n_estimators': 975,
'max_depth': 20,
'learning_rate': 0.07336551102484597,
'subsample': 0.9780814357664563,
'validation_fraction': 0.23861317396041049,
'max_leaf_nodes': 30}
gb = GradientBoostingRegressor(**gb_best_params)
gb.fit(clean_X,y)
GradientBoostingRegressor(learning_rate=0.07336551102484597, max_depth=20, max_leaf_nodes=30, n_estimators=975, subsample=0.9780814357664563, validation_fraction=0.23861317396041049)
y_hat = gb.predict(clean_X)
mean_absolute_error(y, y_hat)
13.382120543161381
r2_score(y, y_hat)
0.9575112806742445
# joblib.dump(gb, 'models/gb_best_model.joblib')
['models/gb_best_model.joblib']
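Note that the MAE and R² above are computed on the same data the model was fit on, so they are optimistic. A cross-validated estimate could be obtained with a sketch like this (same clean_X / y and tuned params; slow with 975 trees):
from sklearn.model_selection import cross_val_score
cv_mae = -cross_val_score(GradientBoostingRegressor(**gb_best_params), clean_X, y,
                          scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
print(cv_mae.mean(), cv_mae.std())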
MLP¶
# current best params
mlp = MLPRegressor(activation = "relu", alpha = 0.01, hidden_layer_sizes = (128,256,256,128,64,16),
solver='adam', max_iter = 200, random_state = 42, verbose=True, early_stopping=True,
validation_fraction=0.2, learning_rate_init=0.001, batch_size=200)
mlp.fit(clean_X,y)
Iteration 1, loss = 1442.73342612 Validation score: 0.913845 Iteration 2, loss = 357.79165794 Validation score: 0.915173 Iteration 3, loss = 354.24238166 Validation score: 0.915162 Iteration 4, loss = 356.11294425 Validation score: 0.913517 Iteration 5, loss = 350.75659481 Validation score: 0.916669 Iteration 6, loss = 352.74777847 Validation score: 0.912908 Iteration 7, loss = 348.73032167 Validation score: 0.905030 Iteration 8, loss = 355.75492122 Validation score: 0.916073 Iteration 9, loss = 346.63812487 Validation score: 0.910881 Iteration 10, loss = 346.65057995 Validation score: 0.917029 Iteration 11, loss = 344.96513343 Validation score: 0.911771 Iteration 12, loss = 343.28964006 Validation score: 0.915064 Iteration 13, loss = 342.38614522 Validation score: 0.917365 Iteration 14, loss = 345.56449837 Validation score: 0.903331 Iteration 15, loss = 351.44812042 Validation score: 0.910155 Iteration 16, loss = 341.80958294 Validation score: 0.915936 Iteration 17, loss = 341.80853920 Validation score: 0.913732 Iteration 18, loss = 346.63825835 Validation score: 0.916550 Iteration 19, loss = 340.23027188 Validation score: 0.902290 Iteration 20, loss = 341.74275672 Validation score: 0.912319 Iteration 21, loss = 342.14433145 Validation score: 0.914689 Iteration 22, loss = 336.29356435 Validation score: 0.916865 Iteration 23, loss = 338.90157567 Validation score: 0.916807 Iteration 24, loss = 339.15099034 Validation score: 0.917057 Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
MLPRegressor(alpha=0.01, batch_size=200, early_stopping=True, hidden_layer_sizes=(128, 256, 256, 128, 64, 16), random_state=42, validation_fraction=0.2, verbose=True)
# evaluate on the training data
y_hat = mlp.predict(clean_X)
print('MAE (more accurate): ',mean_absolute_error(y, y_hat))
## MAE is the metric to focus on here
print('r2 square:', r2_score(y, y_hat))
MAE (more accurate): 17.472911079782566 r2 square: 0.9168742154213051
# joblib.dump(mlp, 'models/mlp_current_best_impute.joblib')
LightGBM¶
lgb_params = {'n_estimators': 975,
'reg_lambda': 0.01,
'max_depth': 25,
'learning_rate': 0.07336551102484597,
'subsample': 0.9780814357664563,
'num_leaves': 30,
'random_state':42}
# current best params
lgb = LGBMRegressor(**lgb_params)
lgb.fit(clean_X,y)
LGBMRegressor(learning_rate=0.07336551102484597, max_depth=25, n_estimators=975, num_leaves=30, random_state=42, reg_lambda=0.01, subsample=0.9780814357664563)
y_hat = lgb.predict(clean_X)
print('MAE (more accurate): ',mean_absolute_error(y, y_hat))
## MAE is the metric to focus on here
print('r2 square:', r2_score(y, y_hat))
MAE (more accurate): 13.742429322623202 r2 square: 0.9526661705590186
# joblib.dump(lgb, 'models/lgb_current_best.joblib')
['models/lgb_current_best.joblib']
TabNet¶
fixed_tab_best_params = {'mask_type': 'entmax',
'n_d': 25,
'n_a': 25,
'n_steps': 3,
'momentum': 0.010908117575012286,
'gamma': 0.7,
'n_shared': 2,
'lambda_sparse': 0.00027473030620813675}
# current best params
tab = TabNetRegressor(**fixed_tab_best_params)
tab.fit(np.array(clean_X), np.expand_dims(y, axis=-1))  # TabNetRegressor expects a 2-D target, hence the expand_dims
Device used : cuda No early stopping will be performed, last training weights will be used. epoch 0 | loss: 8953.12903| 0:00:33s epoch 1 | loss: 852.49627| 0:00:58s epoch 2 | loss: 814.31683| 0:01:24s epoch 3 | loss: 792.81048| 0:01:50s epoch 4 | loss: 770.55513| 0:02:16s epoch 5 | loss: 765.43522| 0:02:41s epoch 6 | loss: 751.48773| 0:03:07s epoch 7 | loss: 771.54519| 0:03:31s epoch 8 | loss: 740.09581| 0:03:56s epoch 9 | loss: 733.72821| 0:04:21s epoch 10 | loss: 748.211 | 0:04:47s epoch 11 | loss: 736.82183| 0:05:13s epoch 12 | loss: 742.05901| 0:05:42s epoch 13 | loss: 733.09496| 0:06:11s epoch 14 | loss: 732.48891| 0:06:42s epoch 15 | loss: 740.575 | 0:07:07s epoch 16 | loss: 735.54345| 0:07:37s epoch 17 | loss: 721.28023| 0:07:59s epoch 18 | loss: 729.66604| 0:08:05s epoch 19 | loss: 714.50833| 0:08:11s epoch 20 | loss: 721.77896| 0:08:18s epoch 21 | loss: 714.39629| 0:08:46s epoch 22 | loss: 722.22848| 0:09:11s epoch 23 | loss: 712.15957| 0:09:36s epoch 24 | loss: 705.44512| 0:10:01s epoch 25 | loss: 708.09923| 0:10:30s epoch 26 | loss: 703.12696| 0:10:58s epoch 27 | loss: 707.21793| 0:11:29s epoch 28 | loss: 705.9106| 0:11:55s epoch 29 | loss: 694.68936| 0:12:20s epoch 30 | loss: 700.87022| 0:12:44s epoch 31 | loss: 716.3981| 0:13:11s epoch 32 | loss: 691.63925| 0:13:35s epoch 33 | loss: 692.69606| 0:14:02s epoch 34 | loss: 701.88751| 0:14:27s epoch 35 | loss: 690.54492| 0:14:53s epoch 36 | loss: 691.28716| 0:15:23s epoch 37 | loss: 691.62379| 0:15:51s epoch 38 | loss: 692.16076| 0:16:17s epoch 39 | loss: 684.27196| 0:16:45s epoch 40 | loss: 687.81668| 0:17:12s epoch 41 | loss: 677.19113| 0:17:39s epoch 42 | loss: 681.67597| 0:18:02s epoch 43 | loss: 676.0142| 0:18:09s epoch 44 | loss: 679.84201| 0:18:15s epoch 45 | loss: 674.48857| 0:18:21s epoch 46 | loss: 676.14036| 0:18:42s epoch 47 | loss: 669.31486| 0:19:10s epoch 48 | loss: 670.04879| 0:19:36s epoch 49 | loss: 668.82035| 0:20:04s epoch 50 | loss: 660.84246| 0:20:31s epoch 51 | loss: 670.62756| 0:20:59s epoch 52 | loss: 661.31606| 0:21:26s epoch 53 | loss: 660.19258| 0:21:54s epoch 54 | loss: 654.09388| 0:22:24s epoch 55 | loss: 649.882 | 0:22:52s epoch 56 | loss: 648.81973| 0:23:20s epoch 57 | loss: 652.7906| 0:23:49s epoch 58 | loss: 652.61544| 0:24:17s epoch 59 | loss: 650.78513| 0:24:45s epoch 60 | loss: 646.05664| 0:25:13s epoch 61 | loss: 653.3171| 0:25:41s epoch 62 | loss: 647.61954| 0:26:08s epoch 63 | loss: 651.46016| 0:26:36s epoch 64 | loss: 643.71479| 0:27:04s epoch 65 | loss: 637.70394| 0:27:32s epoch 66 | loss: 639.8815| 0:28:00s epoch 67 | loss: 637.08897| 0:28:28s epoch 68 | loss: 631.35882| 0:28:56s epoch 69 | loss: 637.44037| 0:29:25s epoch 70 | loss: 639.53357| 0:29:54s epoch 71 | loss: 630.20927| 0:30:21s epoch 72 | loss: 619.02502| 0:30:49s epoch 73 | loss: 630.20647| 0:31:18s epoch 74 | loss: 629.56747| 0:31:47s epoch 75 | loss: 625.35182| 0:32:15s epoch 76 | loss: 616.51182| 0:32:42s epoch 77 | loss: 621.1279| 0:33:10s epoch 78 | loss: 626.442 | 0:33:38s epoch 79 | loss: 617.54879| 0:34:07s epoch 80 | loss: 615.93781| 0:34:36s epoch 81 | loss: 610.93298| 0:35:03s epoch 82 | loss: 609.96735| 0:35:31s epoch 83 | loss: 610.75022| 0:35:59s epoch 84 | loss: 608.97901| 0:36:27s epoch 85 | loss: 604.28721| 0:36:55s epoch 86 | loss: 607.05691| 0:37:22s epoch 87 | loss: 617.39979| 0:37:50s epoch 88 | loss: 602.43495| 0:38:18s epoch 89 | loss: 599.22233| 0:38:45s epoch 90 | loss: 602.12066| 0:39:26s epoch 91 | loss: 598.58312| 0:39:54s epoch 92 | loss: 599.88362| 0:40:22s epoch 93 | loss: 597.34844| 0:40:51s 
epoch 94 | loss: 592.66699| 0:41:20s epoch 95 | loss: 595.19304| 0:41:48s epoch 96 | loss: 592.30365| 0:42:17s epoch 97 | loss: 596.72432| 0:42:44s epoch 98 | loss: 587.99535| 0:43:13s epoch 99 | loss: 590.58971| 0:43:42s
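The log above notes that no early stopping is performed because fit received no validation data. A sketch of how an eval_set could be supplied, assuming a simple split of clean_X / y (tab_es is a hypothetical separate model so the fitted tab above is untouched):
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(np.array(clean_X), np.expand_dims(y, axis=-1),
                                            test_size=0.2, random_state=42)
tab_es = TabNetRegressor(**fixed_tab_best_params)
tab_es.fit(X_tr, y_tr,
           eval_set=[(X_val, y_val)],
           eval_metric=['mae'],
           max_epochs=100,
           patience=15)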
y_hat = tab.predict(np.array(clean_X))
print('MAE (more accurate): ',mean_absolute_error(y, y_hat))
## MAE is the metric to focus on here
print('r2 square:', r2_score(y, y_hat))
MAE (more accurate): 16.404377488337662 r2 square: 0.9307090121817564
SVM¶
svr_best_params = {'kernel': 'rbf',
'gamma': 'scale',
'C': 0.1}
svr = SVR(**svr_best_params)
svr.fit(clean_X,y)
y_hat = svr.predict(clean_X)
print('MAE (more accurate): ',mean_absolute_error(y, y_hat))
print('r2 square:', r2_score(y, y_hat))
XGBoost¶
xg_best_params = {'booster': 'dart',
'lambda': 0.1978947368034619,
'alpha': 0.0009102278884164788,
'max_depth': 4,
'eta': 0.6864948971995669,
'gamma': 5.2062504664042595e-05,
'grow_policy': 'lossguide',
'sample_type': 'weighted',
'normalize_type': 'forest',
'rate_drop': 4.887412967843037e-08,
'skip_drop': 2.0347272399655997e-05}
xg = XGBRegressor(**xg_best_params)
xg.fit(clean_X,y)
y_hat = xg.predict(clean_X)
print('MAE: ',mean_absolute_error(y, y_hat))
print('r2 square:', r2_score(y, y_hat))
RandomForest¶
rf_best_params = {'n_estimators': 934, 'max_depth': 12}
rf = RandomForestRegressor(**rf_best_params)
rf.fit(clean_X,y)
y_hat = rf.predict(clean_X)
print('MSE: ',mean_squared_error(y, y_hat))
print('r2 square:', r2_score(y, y_hat))
Tensorflow NN¶
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Dropout, Conv1D, AveragePooling1D, AveragePooling2D
from tensorflow.keras.layers import Input, Flatten, Reshape, Lambda, BatchNormalization, Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L2
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
tf.keras.backend.clear_session()
BATCH_SIZE = 200
EPOCHS = 60
INIT_LR = 0.001
# sklearn uses GlorotUniform by default (per its source code)
xavier = tf.keras.initializers.GlorotUniform(seed=42)
optimizer = Adam(learning_rate=INIT_LR, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
# , kernel_regularizer=L2(0.01)
#(128,256,256,128,64,16)
model = Sequential([
InputLayer(input_shape=(clean_X.shape[1],), dtype='float32'),
Dense(128, activation='relu', kernel_initializer=xavier),
Dense(256, activation='relu', kernel_initializer=xavier),
Dense(256, activation='relu', kernel_initializer=xavier, kernel_regularizer=L2(0.01)),
Dense(128, activation='relu', kernel_initializer=xavier),
Dense(64, activation='relu', kernel_initializer=xavier, kernel_regularizer=L2(0.01)),
Dense(16, activation='relu', kernel_initializer=xavier),
Dense(1, activation='linear', kernel_initializer=xavier, kernel_regularizer=L2(0.01)),
])
# callbacks
early_stopping = EarlyStopping(patience=10, restore_best_weights=True, monitor='val_r2_metric', mode='max')
def r2_metric(y_true, y_pred):
    # R^2 = 1 - SS_res / SS_tot; SS_tot is taken around the mean of y_true
    # (the original version mistakenly used the mean of y_pred)
    numerator = K.sum((y_true - y_pred) ** 2, axis=0)
    denominator = K.sum((y_true - K.mean(y_true, axis=0)) ** 2, axis=0)
    return 1 - (numerator / denominator)
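With the denominator taken around the mean of y_true (fixed above), the custom metric matches sklearn's r2_score; a quick check on hypothetical dummy arrays:
import numpy as np
from sklearn.metrics import r2_score
y_true_demo = np.array([[1.0], [2.0], [3.0], [4.0]], dtype='float32')
y_pred_demo = np.array([[1.1], [1.9], [3.2], [3.8]], dtype='float32')
print(float(r2_metric(K.constant(y_true_demo), K.constant(y_pred_demo))[0]))  # ~0.98
print(r2_score(y_true_demo, y_pred_demo))                                     # ~0.98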
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2_metric])
model.fit(clean_X, y,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
verbose=1,
validation_split=0.2,
callbacks=[early_stopping]
)
y_hat = model.predict(clean_X).flatten()
print('MAE (more accurate): ',mean_absolute_error(y, y_hat))
## MAE is the metric to focus on here
print('r2 square:', r2_score(y, y_hat))
# TF model with KFold cross-validation
kfold = KFold(n_splits=5, shuffle=True)
cv_loss_per_fold = []
train_loss_per_fold = []
fold_no = 1
inputs = np.array(clean_X)
targets = np.array(y)
for train, test in kfold.split(inputs, targets):
model = Sequential([
InputLayer(input_shape=(clean_X.shape[1],), dtype='float32'),
Dense(128, activation='relu'),
Dense(256, activation='relu'),
Dropout(0.5),
Dense(256, activation='relu'),
Dropout(0.5),
Dense(64, activation='relu'),
Dense(16, activation='relu'),
Dense(1, activation='linear')
])
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=[])
print('------------------------------------------------------------------------')
print(f'Training for fold {fold_no} ...')
H = model.fit(inputs[train], targets[train],
batch_size=BATCH_SIZE,
epochs=EPOCHS,
verbose=1,
# validation_split=0.2,
# callbacks=[early_stopping]
)
# Generate generalization metrics
scores = model.evaluate(inputs[test], targets[test], verbose=0)
print(f'Training loss for fold {fold_no}: {H.history["loss"][-1]}')
print(f'CV Score for fold {fold_no}: {model.metrics_names[0]} of {scores}')
cv_loss_per_fold.append(scores)
train_loss_per_fold.append(H.history["loss"][-1])
fold_no += 1
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('CV Score per fold')
for i in range(0, len(cv_loss_per_fold)):
print('------------------------------------------------------------------------')
print(f'> Fold {i+1} - Loss: {cv_loss_per_fold[i]}')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Train Average Loss: {np.mean(train_loss_per_fold)}')
print(f'> CV Average Loss: {np.mean(cv_loss_per_fold)}')
print('------------------------------------------------------------------------')
test  # leftover cell: displays the test indices of the last CV fold
1D-CNN¶
This did not work well; the implementation may also be incorrect.
def r2_metric(y_true, y_pred):
    # same corrected R^2 metric as above (SS_tot around the mean of y_true)
    numerator = K.sum((y_true - y_pred) ** 2, axis=0)
    denominator = K.sum((y_true - K.mean(y_true, axis=0)) ** 2, axis=0)
    return 1 - (numerator / denominator)
cnn_X = np.expand_dims(clean_X, axis=-1)
tf.keras.backend.clear_session()
BATCH_SIZE = 200
EPOCHS = 60
INIT_LR = 0.001
# sklearn uses GlorotUniform by default (per its source code)
xavier = tf.keras.initializers.GlorotUniform(seed=42)
optimizer = Adam(learning_rate=INIT_LR, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
# callbacks
early_stopping = EarlyStopping(patience=10, restore_best_weights=True, monitor='val_r2_metric', mode='max')
#(128,256,256,128,64,16)
model = Sequential([
InputLayer(input_shape=(clean_X.shape[1],1), dtype='float32'),
Dense(128, activation='relu', kernel_initializer=xavier),
Conv1D(256, 3, activation='relu', padding='same'),
AveragePooling1D(),
Conv1D(256, 3, activation='relu', padding='same'),
Dropout(0.2),
Conv1D(128, 3, activation='relu', padding='same'),
Dropout(0.2),
Conv1D(64, 3, activation='relu', padding='same'),
AveragePooling1D(),
Dropout(0.2),
Flatten(),
Dense(16, activation='relu'),
Dense(1, activation='linear')
])
# NOTE: this functional-API model replaces the Sequential CNN defined above
inputs = Input(shape=(clean_X.shape[1],), dtype='float32')
x = Dense(4096, activation='relu')(inputs)
x = Reshape((256, 16))(x)
x = BatchNormalization()(x)
x = Conv1D(512, 3, activation='relu', padding='same')(x)
x = AveragePooling1D()(x)
x = BatchNormalization()(x)
x_s = Conv1D(512, 3, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = Conv1D(512, 3, activation='relu', padding='same')(x_s)
x = BatchNormalization()(x)
x = Conv1D(512, 3, activation='relu', padding='same')(x)
sum_layer = Lambda(lambda x: K.sum(x, axis=0, keepdims=False))([x, x_s])  # element-wise sum of the two branches (skip connection)
x = AveragePooling1D()(sum_layer)
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x)
x = Dense(1, activation='linear')(x)
model = tf.keras.models.Model(inputs=inputs, outputs=x)
model.summary()
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2_metric])
cnn_X.shape
model.fit(cnn_X.reshape((clean_X.shape[0], 1,clean_X.shape[1])), y,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
verbose=1,
validation_split=0.2
# callbacks=[early_stopping]
)
cnn_X.reshape((clean_X.shape[0], 1,clean_X.shape[1])).shape
cnn_X.shape
y_hat = model.predict(cnn_X.reshape((clean_X.shape[0], 1,clean_X.shape[1]))).flatten()
print('MAE (more accurate): ',mean_absolute_error(y, y_hat))
## MAE is the metric to focus on here
print('r2 square:', r2_score(y, y_hat))
Submission¶
test_df = pd.read_csv('test_extra_2.csv')
test_new_feats = pd.read_csv('test_feats_from_di_cnn.csv')
test_comb_feats = pd.read_csv('test_feats_from_comb_cnn.csv')
test_df = pd.concat([test_df, test_new_feats, test_comb_feats], axis=1)
test_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17000 entries, 0 to 16999 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MOFname 17000 non-null object 1 volume [A^3] 17000 non-null float64 2 weight [u] 17000 non-null float64 3 surface_area [m^2/g] 17000 non-null float64 4 void_fraction 17000 non-null float64 5 void_volume [cm^3/g] 17000 non-null float64 6 functional_groups 17000 non-null object 7 metal_linker 17000 non-null int64 8 organic_linker1 17000 non-null int64 9 organic_linker2 17000 non-null int64 10 topology 17000 non-null object 11 CO2/N2_selectivity 17000 non-null float64 12 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 17000 non-null float64 13 _cell_length_a 17000 non-null float64 14 _cell_length_b 17000 non-null float64 15 _cell_length_c 17000 non-null float64 16 _cell_angle_alpha 17000 non-null float64 17 _cell_angle_beta 17000 non-null float64 18 _cell_angle_gamma 17000 non-null float64 19 n_atoms 17000 non-null int64 20 mol_avg_mass 17000 non-null float64 21 charges 17000 non-null int64 22 mol_avg_radius 17000 non-null float64 23 atoms_volume 17000 non-null float64 24 atoms_area 17000 non-null float64 25 di_cnn_1 17000 non-null float64 26 di_cnn_2 17000 non-null float64 27 di_cnn_4 17000 non-null float64 28 di_cnn_5 17000 non-null float64 29 di_cnn_6 17000 non-null float64 30 di_cnn_8 17000 non-null float64 31 di_cnn_10 17000 non-null float64 32 di_cnn_12 17000 non-null float64 33 di_cnn_13 17000 non-null float64 34 di_cnn_14 17000 non-null float64 35 di_cnn_15 17000 non-null float64 36 comb_cnn_0 17000 non-null float64 37 comb_cnn_1 17000 non-null float64 38 comb_cnn_2 17000 non-null float64 39 comb_cnn_3 17000 non-null float64 dtypes: float64(32), int64(5), object(3) memory usage: 5.2+ MB
check_all_problems(test_df)
Missing values in Functional_groups: 0 Missing values in Heat adsorption: 0, Inf value in Heat Adsorption: 0 0 values in void_volume: 0 0 values in void_fraction: 0, -1 value: 0 0 values in surface_area: 0, -1 value: 0
# preprocess
test_df = test_df[['MOFname', 'volume [A^3]', 'weight [u]', 'surface_area [m^2/g]',
'void_fraction', 'void_volume [cm^3/g]', 'functional_groups',
'metal_linker', 'organic_linker1', 'organic_linker2', 'topology',
'CO2/N2_selectivity', 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius', 'atoms_volume', 'atoms_area','di_cnn_1',
'di_cnn_2', 'di_cnn_4', 'di_cnn_5', 'di_cnn_6', 'di_cnn_8', 'di_cnn_10',
'di_cnn_12', 'di_cnn_13', 'di_cnn_14', 'di_cnn_15', 'comb_cnn_0', 'comb_cnn_1', 'comb_cnn_2', 'comb_cnn_3']]
test_df = test_df.drop(['functional_groups'], axis=1)
test_df.insert(
loc=2,
column="density [g/cm^3]",
value=(test_df["weight [u]"] / test_df["volume [A^3]"]) * 1.66054,
)
test_df.insert(
loc=2,
column="volume [cm^3/g]",
value=test_df['volume [A^3]'] / (test_df['weight [u]'] * 1.66054),
)
test_df['specific_heat'] = (test_df['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]']*4.1868*1000)/(test_df['weight [u]']*65)  # recreate the specific_heat feature for the test set
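The 1.66054 factor used above for density and volume/g is the u/Å³ → g/cm³ conversion (1 u = 1.66054e-24 g, 1 Å³ = 1e-24 cm³), and 4.1868*1000 converts kcal to J. A quick check of the density factor:
u_in_g = 1.66054e-24       # grams per atomic mass unit
A3_in_cm3 = 1e-24          # cm^3 per cubic angstrom
print(u_in_g / A3_in_cm3)  # 1.66054 -> 1 u per A^3 equals 1.66054 g per cm^3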
# drop name and those ratio we used
test_X = test_df.drop(['MOFname'], axis = 1)
test_X = test_X.rename(columns={'volume [A^3]':'volume', 'volume [cm^3/g]':'volume/g', 'density [g/cm^3]':'density',
'weight [u]':'weight', 'surface_area [m^2/g]':'surface_area', 'void_volume [cm^3/g]':'void_volume',
'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]':'heat_adsorption'
})
# one hot
# ## functional groups
# test_encoded_fg = fg_lb.transform(test_X['functional_groups'])
# test_X = test_X.drop(['functional_groups'], axis = 1)
# # fill one-hot encoded functional_groups to X
# for i in range(total_fg):
# test_X[f'functional_groups_{i}'] = test_encoded_fg[:, i]
## metal_linker
test_encoded_metal = metal_lb.transform(test_X['metal_linker'])
test_X = test_X.drop(['metal_linker'], axis = 1)
# fill one-hot encoded metal_linker to X
for i in range(total_metal):
test_X[f'metal_linker_{i}'] = test_encoded_metal[:, i]
## organic_linker
test_encoded_organic1 = organic_lb.transform(test_X['organic_linker1'])
test_encoded_organic2 = organic_lb.transform(test_X['organic_linker2'])
test_X = test_X.drop(['organic_linker1', 'organic_linker2'], axis = 1)
# fill one-hot encoded version to X
# linker 1
for i in range(total_organic):
test_X[f'organic_linker1_{i}'] = test_encoded_organic1[:, i]
# linker 2
for i in range(total_organic):
test_X[f'organic_linker2_{i}'] = test_encoded_organic2[:, i]
## topology
test_encoded_topology = topology_lb.transform(test_X['topology'])
test_X = test_X.drop(['topology'], axis = 1)
# fill one-hot encoded topology into X
for i in range(total_topology):
test_X[f'topology_{i}'] = test_encoded_topology[:, i]
# feature scaling
# test_scaled_feats = mm_scaler.transform(test_X[feats_to_scale])
test_scaled_feats = rb_scaler.transform(test_X[feats_to_scale])
for i, feat in enumerate(feats_to_scale):
test_X[feat] = test_scaled_feats[:, i]
## 1 scaler for 1 feat (same result)
# for feat in feats_to_scale:
# scaled_feat = scalers[feat].transform(test_X[[feat]])
# test_X[feat] = scaled_feat
test_X
volume | volume/g | density | weight | surface_area | void_fraction | void_volume | CO2/N2_selectivity | heat_adsorption | n_atoms | ... | topology_1 | topology_2 | topology_3 | topology_4 | topology_5 | topology_6 | topology_7 | topology_8 | topology_9 | topology_10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.479628 | -0.272893 | 0.314058 | -0.460370 | -0.614563 | -0.704325 | -0.513748 | 0.841859 | 0.753026 | -0.302326 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 0.945696 | -0.807447 | 1.273569 | 2.466234 | -0.451899 | -0.222836 | -0.399283 | -0.064565 | -0.481679 | 0.558140 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | -0.287257 | 0.548750 | -0.446130 | -0.510708 | 0.956839 | 0.381305 | 0.483264 | -0.329196 | -0.530118 | -0.581395 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0.120337 | 0.943906 | -0.672449 | -0.262907 | 1.180306 | 0.698967 | 0.957860 | -0.501135 | -0.490294 | -0.186047 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 0.455792 | 0.798101 | -0.595773 | 0.077741 | 0.664925 | 0.626321 | 0.813210 | -0.333562 | -0.371552 | 0.093023 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
16995 | 14.883110 | 6.775274 | -1.708149 | 3.453108 | 2.525318 | 2.492180 | 8.834429 | -0.752689 | -1.772980 | 3.558140 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
16996 | 1.407069 | 2.165319 | -1.115874 | 0.238821 | 1.483792 | 1.602959 | 2.779438 | -0.642975 | -1.437719 | 0.162791 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
16997 | 1.211109 | 2.360628 | -1.164993 | 0.062301 | 1.649838 | 1.398200 | 2.669157 | -0.746253 | -1.480803 | 0.023256 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
16998 | 1.217125 | 2.786262 | -1.258839 | -0.045891 | 1.676200 | 1.672998 | 3.387029 | -0.738450 | -1.571499 | -0.069767 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
16999 | 11.484388 | 4.749416 | -1.543962 | 3.522421 | 2.088765 | 2.208162 | 6.098326 | -0.765526 | -1.757809 | 3.604651 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
17000 rows × 163 columns
check_all_problems(test_df)
Missing values in Heat adsorption: 0, Inf value in Heat Adsorption: 0 0 values in void_volume: 0 0 values in void_fraction: 0, -1 value: 0 0 values in surface_area: 0, -1 value: 0
clean_X.columns == test_X.columns
array([ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
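The element-wise comparison above is easy to misread if a single entry were False; collapsing it into one assertion makes the check explicit:
assert (clean_X.columns == test_X.columns).all(), "train/test feature columns are misaligned"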
predict = gb.predict(test_X)
predicted_df = pd.DataFrame({"id":range(68614,68614+len(test_X)), "CO2_working_capacity [mL/g]":predict})
predicted_df
id | CO2_working_capacity [mL/g] | |
---|---|---|
0 | 68614 | 180.180567 |
1 | 68615 | 63.891765 |
2 | 68616 | 66.823295 |
3 | 68617 | 62.815305 |
4 | 68618 | 62.736945 |
... | ... | ... |
16995 | 85609 | -8.060973 |
16996 | 85610 | 1.188486 |
16997 | 85611 | -0.403545 |
16998 | 85612 | -1.161775 |
16999 | 85613 | -4.178828 |
17000 rows × 2 columns
# tabnet
predict = tab.predict(np.array(test_X))
predicted_df = pd.DataFrame({"id":range(68614,68614+len(test_X)), "CO2_working_capacity [mL/g]":predict[:,0]})
predicted_df
id | CO2_working_capacity [mL/g] | |
---|---|---|
0 | 68614 | 180.315536 |
1 | 68615 | 66.231522 |
2 | 68616 | 73.615433 |
3 | 68617 | 62.306282 |
4 | 68618 | 62.686466 |
... | ... | ... |
16995 | 85609 | -5.499111 |
16996 | 85610 | -0.111550 |
16997 | 85611 | -0.917235 |
16998 | 85612 | -0.976805 |
16999 | 85613 | -3.520573 |
17000 rows × 2 columns
# conv1d
predict = model.predict(np.expand_dims(test_X, axis=-1).reshape((test_X.shape[0], 1,test_X.shape[1]))).flatten()
predicted_df = pd.DataFrame({"id":range(68614,68614+len(test_X)), "CO2_working_capacity [mL/g]":predict})
predicted_df
# tf
predict = model.predict(test_X)
predicted_list = []
for i in range(len(predict)):
predicted_list.append([f"pretest_{i+1}",predict[i][0]])
predicted_df = pd.DataFrame(data=predicted_list,columns=["Id","CO2"])
predicted_df
predicted_df.to_csv("output/phase2/submission.csv",index=False)
gb3_importance = pd.Series(gb.feature_importances_, index=clean_X.columns)
gb3_importance.sort_values(ascending=False).to_frame().rename(columns={0:'Importance'}).head(25)
# LightGBM's feature_importances_ use importance_type='split' by default (number of times a feature is used in splits), hence the integer values below
lgb_importance = pd.Series(lgb.feature_importances_, index=clean_X.columns)
lgb_importance.sort_values(ascending=False).to_frame().rename(columns={0:'Importance'}).head(25)
Importance | |
---|---|
CO2/N2_selectivity | 2028 |
mol_avg_mass | 2016 |
heat_adsorption | 1981 |
surface_area | 1853 |
comb_cnn_3 | 1848 |
charges | 1726 |
volume/g | 1487 |
void_fraction | 1417 |
di_cnn_1 | 1345 |
weight | 1281 |
di_cnn_14 | 1211 |
di_cnn_10 | 1126 |
n_atoms | 1090 |
void_volume | 1087 |
volume | 1006 |
di_cnn_12 | 787 |
di_cnn_5 | 743 |
di_cnn_2 | 684 |
di_cnn_6 | 534 |
comb_cnn_2 | 398 |
comb_cnn_1 | 392 |
density | 212 |
metal_linker_1 | 145 |
di_cnn_15 | 103 |
metal_linker_2 | 83 |
Ensemble¶
Ensemble the best saved predictions from each model (ensembling different training runs of the same model did not work)
mlp_best = pd.read_csv('output/phase2/best_predicts/mlp_best.csv')
lgb_best = pd.read_csv('output/phase2/best_predicts/lgb_best.csv')
gb_best = pd.read_csv('output/phase2/best_predicts/gb_best.csv')
tab_best = pd.read_csv('output/phase2/best_predicts/tab_best.csv')
di_cnn_best = pd.read_csv('output/phase2/best_predicts/di_cnn_best.csv')
combined_cnn_best = pd.read_csv('output/phase2/best_predicts/combined_cnn_best.csv')
ensemble = mlp_best[['id']].copy()  # copy to avoid pandas SettingWithCopyWarning when adding the prediction column
ensemble['CO2_working_capacity [mL/g]'] = (0.34 * lgb_best['CO2_working_capacity [mL/g]']) + \
(0.33 * mlp_best['CO2_working_capacity [mL/g]']) + \
(0.33 * gb_best['CO2_working_capacity [mL/g]'])
ensemble
id | CO2_working_capacity [mL/g] | |
---|---|---|
0 | 68614 | 173.741324 |
1 | 68615 | 64.919034 |
2 | 68616 | 70.031861 |
3 | 68617 | 63.992929 |
4 | 68618 | 63.005615 |
... | ... | ... |
16995 | 85609 | -7.920903 |
16996 | 85610 | 0.856687 |
16997 | 85611 | -0.066948 |
16998 | 85612 | -1.142232 |
16999 | 85613 | -4.423955 |
17000 rows × 2 columns
ensemble.to_csv('output/phase2/submission.csv', index=False)
# best_mlp = joblib.load('models/mlp_current_best_impute.joblib')
# best_gb = joblib.load('models/gb_best_model.joblib')
# best_lgb = joblib.load('models/lgb_current_best.joblib')
# mlp_predict = best_mlp.predict(test_X)
# gb_predict = best_gb.predict(test_X)
# lgb_predict = best_lgb.predict(test_X)
# weighted average
# mean_ensemble = (0.7 * mlp_predict) + (0.05 * gb_predict) + (0.25 * lgb_predict)
# mean_ensemble
# ensemble_df = pd.DataFrame({"id":range(68614,68614+len(test_X)), "CO2_working_capacity [mL/g]":mean_ensemble})
# ensemble_df
# ensemble_df.to_csv('output/phase2/submission.csv', index=False)
2nd iter ensemble¶
ensemble_df_2nd = ensemble.copy()
ensemble_df_2nd['CO2_working_capacity [mL/g]'] = (0.7*ensemble['CO2_working_capacity [mL/g]']) + \
(0.3*lgb_best['CO2_working_capacity [mL/g]'])
ensemble_df_2nd.to_csv('output/phase2/submission.csv', index=False)
Best ensemble¶
best = pd.read_csv('output/phase2/best_predicts/first_iter_best.csv')
second_best = pd.read_csv('output/phase2/best_predicts/second_best.csv')
best_ensemble = best[['id']].copy()  # copy to avoid pandas SettingWithCopyWarning
best_ensemble['CO2_working_capacity [mL/g]'] = (0.5*best['CO2_working_capacity [mL/g]']) + \
(0.5*second_best['CO2_working_capacity [mL/g]'])
best_ensemble.to_csv('output/phase2/submission.csv', index=False)