#import libraries import pandas as pd import numpy as np import joblib import warnings warnings.filterwarnings('ignore') from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, r2_score #importing the dataset df= pd.read_csv('clean_water_requirement_data.csv') print(df.shape) #finding the categorical and numerical columns categorical_columns = df.select_dtypes(include=['object', 'category']).columns numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns #encoding the categorical variables from sklearn.preprocessing import LabelEncoder label_encoder = LabelEncoder() for col in categorical_columns: df[col] = label_encoder.fit_transform(df[col]) #separating input and target features in dataset X = df.drop(columns=['water_requirement']) y=df.iloc[:,-1] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) from sklearn.preprocessing import StandardScaler # Apply StandardScaler scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) # Initialize the model with the provided best parameters model = RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42) # Fit the model model.fit(X_train, y_train) # Predictions y_pred = model.predict(X_test) # Calculate performance metrics mse = mean_squared_error(y_test, y_pred) mae = mean_absolute_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) # Save the model as a .pkl file using joblib model_filename = 'random_forest_regressor_model.pkl' joblib.dump(model, model_filename)