A user is trying to train a decision tree regressor (PySpark's DecisionTreeRegressor) on some training data, but calling fit() raises an error:
(trainingData, testData) = data.randomSplit([0.7, 0.3])
vecAssembler = VectorAssembler(inputCols=["_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_10"], outputCol="features")
dt = DecisionTreeRegressor(featuresCol="features", labelCol="_11")
dt_model = dt.fit(trainingData)
He receives the following error:
File "spark.py", line 100, in
main()
File "spark.py", line 87, in main
dt_model = dt.fit(trainingData)
File "/opt/spark/python/pyspark/ml/base.py", line 132, in fit
return self._fit(dataset)
File "/opt/spark/python/pyspark/ml/wrapper.py", line 295, in _fit
java_model = self._fit_java(dataset)
File "/opt/spark/python/pyspark/ml/wrapper.py", line 292, in _fit_java
return self._java_obj.fit(dataset._jdf)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/spark/python/pyspark/sql/utils.py", line 79, in deco
raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: u'requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
The above error occurs because the transformation step is missing: the VectorAssembler is created but never applied, and the features and label columns are never selected from the transformed data before fit() is called. The code below fixes the issue:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# data processing part
vecAssembler = VectorAssembler(
    inputCols=["_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_10"],
    outputCol="features")

# you missed these two steps
trans_data = vecAssembler.transform(data)
final_data = trans_data.select("features", "_11")  # your label column name is _11

train_data, test_data = final_data.randomSplit([0.7, 0.3])

# ml part
dt = DecisionTreeRegressor(featuresCol="features", labelCol="_11")
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)
# proceed with the model evaluation part after this
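For that evaluation step, a minimal sketch using pyspark.ml.evaluation.RegressionEvaluator is shown below; it assumes the label "_11" is a continuous target and that RMSE is the metric of interest (the metric choice is illustrative, not part of the original post):

from pyspark.ml.evaluation import RegressionEvaluator

# assumes dt_predictions from the regressor above; "prediction" is the default output column
evaluator = RegressionEvaluator(labelCol="_11", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(dt_predictions)
print("RMSE on test data = %g" % rmse)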
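As an alternative layout, the assembler and the estimator can be chained in a pyspark.ml.Pipeline, which applies the transform automatically during fit() and makes it harder to forget that step; a sketch under the same column-name assumptions:

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# the pipeline runs the assembler before fitting the tree, so no manual transform() call is needed
vecAssembler = VectorAssembler(inputCols=["_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9", "_10"], outputCol="features")
dt = DecisionTreeRegressor(featuresCol="features", labelCol="_11")
pipeline = Pipeline(stages=[vecAssembler, dt])

train_data, test_data = data.randomSplit([0.7, 0.3])
pipeline_model = pipeline.fit(train_data)
dt_predictions = pipeline_model.transform(test_data)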