
Commit

Final commit, added encoding for categorical data (untested) and added notebook to showcase some of the functionality

Signed-off-by: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com>
AnthonyCampbell208 committed Aug 10, 2023
1 parent 5d42502 commit 3a67554
Showing 4 changed files with 1,090 additions and 25 deletions.
14 changes: 8 additions & 6 deletions econml/dml/dml.py
@@ -472,6 +472,8 @@ def __init__(self, *,
                   model_y, model_t, model_final,
                   param_list_y=None,
                   param_list_t=None,
+                  scoring_y=None,
+                  scoring_t=None,
                   scaling=False,
                   featurizer=None,
                   treatment_featurizer=None,
@@ -493,6 +495,8 @@ def __init__(self, *,
        self.scaling = scaling
        self.param_list_y = param_list_y
        self.param_list_t = param_list_t
+       self.scoring_y = scoring_y
+       self.scoring_t = scoring_t
        self.verbose = verbose
        self.cv = cv
        self.grid_folds = grid_folds
@@ -514,10 +518,10 @@ def _gen_featurizer(self):

    def _gen_model_y(self):  # New
        if self.model_y == 'auto':
-           model_y = SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y,
+           model_y = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_y, scoring=self.scoring_y,
                                          scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state)
        else:
-           model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y,
+           model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, scoring=self.scoring_y,
                                                scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False)
        # if self.model_y == 'auto':
        #     model_y = WeightedLassoCVWrapper(random_state=self.random_state)
@@ -527,15 +531,13 @@ def _gen_model_y(self):  # New
                                     self.linear_first_stages, self.discrete_treatment)

    def _gen_model_t(self):  # New
-       # import pdb
-       # pdb.set_trace()
        if self.model_t == 'auto':
            if self.discrete_treatment:
-               model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t,
+               model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scoring=self.scoring_t,
                                              scaling=self.scaling, verbose=self.verbose, cv=WeightedStratifiedKFold(random_state=self.random_state), is_discrete=self.discrete_treatment,
                                              n_jobs=self.n_jobs, random_state=self.random_state)
            else:
-               model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t,
+               model_t = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_t, scoring=self.scoring_t,
                                              scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment,
                                              n_jobs=self.n_jobs, random_state=self.random_state)
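
For context, a minimal usage sketch of the new scoring_y / scoring_t arguments. It assumes this __init__ belongs to econml's DML estimator and that y, T, X, W are user-supplied arrays; the scorer strings are standard sklearn scoring names, and the feature is untested per the commit message.

# Hypothetical usage (illustrative, untested):
from econml.dml import DML
from sklearn.linear_model import LassoCV

est = DML(model_y='auto', model_t='auto', model_final=LassoCV(),
          discrete_treatment=True,
          scoring_y='neg_mean_squared_error',  # scorer for the outcome-model grid search
          scoring_t='neg_log_loss')            # scorer for the treatment-model grid search
est.fit(y, T, X=X, W=W)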

23 changes: 11 additions & 12 deletions econml/sklearn_extensions/model_selection.py
@@ -354,6 +354,7 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc
        self.error_score = error_score
        self.return_train_score = return_train_score
        self.is_discrete = is_discrete
+       self.supported_models = ['linear', 'forest', 'gbf', 'nnet', 'poly']

    def fit(self, X, y, *, sample_weight=None, groups=None):
        # print(groups)
@@ -400,6 +401,11 @@ def fit(self, X, y, *, sample_weight=None, groups=None):
            self.best_params_ = {}
            return self
        for estimator, param_grid in zip(self.complete_estimator_list, self.param_grid_list):
+           if self.verbose:
+               if is_polynomial_pipeline(estimator):
+                   print(f"Processing estimator: {type(estimator.named_steps['linear']).__name__}")
+               else:
+                   print(f"Processing estimator: {type(estimator).__name__}")
            try:
                if self.random_state != None:
                    if has_random_state(model=estimator):
@@ -408,8 +414,6 @@ def fit(self, X, y, *, sample_weight=None, groups=None):
                        estimator = estimator.set_params(linear__random_state=self.random_state)
                    else:
                        estimator.set_params(random_state=self.random_state)
-               print(estimator)  # Note Delete this
-               print(param_grid)  # Note Delete this
                # pdb.set_trace() # Note Delete this
                temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring,
                                           n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose,
@@ -441,8 +445,11 @@ def fit(self, X, y, *, sample_weight=None, groups=None):
                # This warning catches a problem after fit has run with no exception; a missing cv_results_ indicates a failed fit operation.
                warning_msg = f"Warning: estimator {estimator} and param_grid {param_grid} failed: no attribute cv_results_."
                warnings.warn(warning_msg, category=FitFailedWarning)

-       self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list])
+       try:
+           self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list])
+       except Exception as e:
+           warning_msg = f"Failed for estimator {estimator} and param_grid {param_grid} with this error {e}."
+           raise Exception(warning_msg) from e
        self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_
        self.best_score_ = self._search_list[self.best_ind_].best_score_
        self.best_params_ = self._search_list[self.best_ind_].best_params_
@@ -465,14 +472,6 @@ def predict(self, X):
    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

-   def refit(self, X, y):
-       # Refits the best estimator using the entire dataset.
-       if self.best_estimator_ is None:
-           raise ValueError("No best estimator found. Please call the 'fit' method before calling 'refit'.")
-
-       self.best_estimator_.fit(X, y)
-       return self
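
As a reviewer aid, a minimal smoke-test sketch of SearchEstimatorList after this change. The 'linear' and 'forest' names follow the supported_models list added above; the data is synthetic, and the scoring keyword is assumed from the (truncated) constructor signature shown in this diff.

# Hypothetical smoke test (synthetic data):
import numpy as np
from econml.sklearn_extensions.model_selection import SearchEstimatorList

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = X @ rng.normal(size=5) + rng.normal(size=200)

search = SearchEstimatorList(estimator_list=['linear', 'forest'],
                             scoring='neg_mean_squared_error', random_state=0)
search.fit(X, y)
print(search.best_estimator_, search.best_score_)  # winner across the per-estimator grid searches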


class GridSearchCVList(BaseEstimator):
""" An extension of GridSearchCV that allows for passing a list of estimators each with their own
47 changes: 40 additions & 7 deletions econml/sklearn_extensions/model_selection_utils.py
@@ -27,7 +27,7 @@
from sklearn.exceptions import NotFittedError
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.model_selection import KFold
-# from sklearn_extensions.model_selection import WeightedStratifiedKFold
+import pandas as pd


def select_continuous_estimator(estimator_type, random_state):
@@ -57,6 +57,9 @@ def select_continuous_estimator(estimator_type, random_state):
        poly = PolynomialFeatures()
        linear = ElasticNetCV(random_state=random_state)  # Play around with precompute and tolerance
        return (Pipeline([('poly', poly), ('linear', linear)]))
+   elif estimator_type == 'weighted_lasso':
+       from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper
+       return WeightedLassoCVWrapper(random_state=random_state)
    else:
        raise ValueError(f"Unsupported estimator type: {estimator_type}")
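
A one-line sanity check for the new branch (hedged; the import path mirrors the one used inside the function):

# Hypothetical call exercising the new 'weighted_lasso' branch:
from econml.sklearn_extensions.model_selection_utils import select_continuous_estimator

est = select_continuous_estimator('weighted_lasso', random_state=0)  # returns WeightedLassoCVWrapper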

@@ -278,18 +281,15 @@ def select_classification_hyperparameters(estimator):
    elif isinstance(estimator, MLPClassifier):
        return {
            'hidden_layer_sizes': [(10,), (50,), (100,)],
-           'activation': ['relu'],
-           'solver': ['adam'],
-           'alpha': [0.0001, 0.001, 0.01],
+           'alpha': [0.0001, 0.01],
            'learning_rate': ['constant', 'adaptive']
        }
    elif is_polynomial_pipeline(estimator=estimator):
        return {
            'poly__degree': [2, 3, 4],
            'linear__Cs': [1, 10, 20],
-           'linear__max_iter': [100, 200],
            'linear__penalty': ['l2'],
-           'linear__solver': ['saga', 'liblinear', 'lbfgs']
+           'linear__solver': ['saga', 'lbfgs']
        }
    else:
        warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipeline", category=UserWarning)
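
These grids are consumed by GridSearchCV inside SearchEstimatorList.fit. A hedged standalone sketch with the trimmed MLPClassifier grid above (the estimator settings, data, and scorer are illustrative):

# Hypothetical standalone use of the MLPClassifier grid (3 * 2 * 2 = 12 candidates):
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(max_iter=500, random_state=0)
grid = select_classification_hyperparameters(clf)
search = GridSearchCV(clf, grid, scoring='neg_log_loss', cv=3)
# search.fit(X, y) would evaluate 12 configurations over 3 folds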
@@ -324,7 +324,7 @@ def select_regression_hyperparameters(estimator):
    elif isinstance(estimator, MLPRegressor):
        return {
            'hidden_layer_sizes': [(10,), (50,), (100,)],
-           'alpha': [0.0001, 0.001, 0.01],
+           'alpha': [0.0001, 0.01],
            'learning_rate': ['constant', 'adaptive']
        }
    elif isinstance(estimator, GradientBoostingRegressor):
@@ -775,3 +775,36 @@ def make_param_multi_task(estimator, param_grid):
    else:
        param_grid_multi = {f'estimator__{k}': v for k, v in param_grid.items()}
    return param_grid_multi
+
+
+def preprocess_and_encode(data, cat_indices=None):
+    """
+    Detects categorical columns, one-hot encodes them, and returns the preprocessed data.
+    Parameters:
+    - data: pandas DataFrame or numpy array
+    - cat_indices: list of column indices (or names for a DataFrame) to be treated as categorical
+    Returns:
+    - Preprocessed data in the format of the original input (DataFrame or numpy array)
+    """
+    was_numpy = False
+    if isinstance(data, np.ndarray):
+        was_numpy = True
+        data = pd.DataFrame(data)
+
+    # If cat_indices is None, detect categorical columns using object dtype as a heuristic
+    if cat_indices is None:
+        cat_columns = data.select_dtypes(['object']).columns.tolist()
+    else:
+        if all(isinstance(i, int) for i in cat_indices):  # cat_indices are integer positions
+            cat_columns = data.columns[cat_indices].tolist()
+        else:  # assume cat_indices are column names
+            cat_columns = cat_indices
+
+    data_encoded = pd.get_dummies(data, columns=cat_columns)
+
+    if was_numpy:
+        return data_encoded.values
+    else:
+        return data_encoded
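
Since the commit message flags the encoding as untested, a small sanity-check sketch of the new helper (column names are illustrative):

# Hypothetical sanity check for preprocess_and_encode:
import pandas as pd

df = pd.DataFrame({'city': ['NY', 'SF', 'NY'], 'income': [1.0, 2.0, 3.0]})
out = preprocess_and_encode(df)              # 'city' auto-detected via object dtype
print(out.columns.tolist())                  # ['income', 'city_NY', 'city_SF']
out2 = preprocess_and_encode(df.values, cat_indices=[0])  # numpy in, numpy out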
(Diff for the fourth changed file, the showcase notebook, is not rendered here.)
