This tutorial covers machine learning with MLlib, Apache Spark's scalable machine learning library. It uses the DataFrame-based API in the pyspark.ml package and walks through feature engineering, classification, regression, clustering, and dimensionality reduction.
A typical workflow starts with two feature-engineering steps: extract features from the raw data, then transform them to improve model performance.
VectorAssembler combines multiple numeric columns into the single vector column that MLlib estimators expect as input.
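The snippets in this tutorial assume an active SparkSession bound to spark and a DataFrame df with numeric feature columns and a label column. A minimal toy input could look like this (the column names and values are placeholders):

# Toy input data; assumes an existing SparkSession named `spark`
df = spark.createDataFrame(
    [(1.0, 0.5, 0.0), (2.0, 1.5, 1.0), (3.0, 2.5, 1.0)],
    ["feature1", "feature2", "label"])
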
from pyspark.ml.feature import VectorAssembler

# Merge the raw feature columns into one vector column named "features"
assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features")
df = assembler.transform(df)  # a pure transformer, no fit step needed
StandardScaler scales features to zero mean and unit variance.
from pyspark.ml.feature import StandardScaler

# Standardize each feature; withMean=True centers the data, which
# produces dense vectors, so use it with care on sparse input
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True, withMean=True)
scalerModel = scaler.fit(df)  # an estimator: fit() computes the column statistics
df = scalerModel.transform(df)
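Because StandardScaler is an estimator, the fitted model keeps the statistics it computed, which is handy for a quick sanity check:

# Per-feature mean and standard deviation learned during fit()
print(scalerModel.mean)
print(scalerModel.std)
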
The Pipeline API chains multiple transformers and estimators into a single estimator, so the whole workflow can be fit and applied in one step.
from pyspark.ml import Pipeline

# Chain the assembler and scaler into one workflow
pipeline = Pipeline(stages=[assembler, scaler])
model = pipeline.fit(df)   # fits each stage in order, returns a PipelineModel
df = model.transform(df)
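A fitted PipelineModel can also be persisted and reloaded, so the exact same preprocessing is applied later at serving time. A minimal sketch; the path below is a placeholder:

from pyspark.ml import PipelineModel

# Persist the fitted pipeline (the path is hypothetical)
model.write().overwrite().save("/tmp/feature_pipeline")

# Reload it later and apply the identical transformations
reloaded = PipelineModel.load("/tmp/feature_pipeline")
df_served = reloaded.transform(df)
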
Logistic regression is a common baseline classifier. It is trained here on the scaled features produced above.

from pyspark.ml.classification import LogisticRegression

# Train a logistic regression classifier
lr = LogisticRegression(
    featuresCol="scaledFeatures",
    labelCol="label",
    maxIter=10)
lrModel = lr.fit(df)
predictions = lrModel.transform(df)  # adds rawPrediction, probability, and prediction columns
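Predictions can be scored with MLlib's built-in evaluators. A minimal sketch, assuming a binary label column; in practice, evaluate on a held-out split (e.g. df.randomSplit([0.8, 0.2])) rather than the training data:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve for the predictions above
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(auc)
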
Decision trees handle non-linear relationships and are easy to interpret.

from pyspark.ml.classification import DecisionTreeClassifier

# Train a single decision tree classifier
dt = DecisionTreeClassifier(
    featuresCol="scaledFeatures",
    labelCol="label")
dtModel = dt.fit(df)
predictions = dtModel.transform(df)
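The fitted tree can be printed in full, which helps with interpretation and debugging:

# Human-readable dump of the learned tree structure
print(dtModel.toDebugString)
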
A random forest trains an ensemble of decision trees and combines their votes, which typically improves accuracy and reduces overfitting.

from pyspark.ml.classification import RandomForestClassifier

# Train a random forest (an ensemble of decision trees)
rf = RandomForestClassifier(
    featuresCol="scaledFeatures",
    labelCol="label")
rfModel = rf.fit(df)
predictions = rfModel.transform(df)
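Fitted tree ensembles expose per-feature importances, a quick way to see which inputs drive the model:

# Relative importance of each input feature (a sparse vector)
print(rfModel.featureImportances)
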
For continuous targets, linear regression is the analogous starting point.

from pyspark.ml.regression import LinearRegression

# Train an ordinary least squares regression model
linReg = LinearRegression(
    featuresCol="scaledFeatures",
    labelCol="label",
    maxIter=10)
linRegModel = linReg.fit(df)
predictions = linRegModel.transform(df)
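The fitted model carries its coefficients and a training summary with common fit statistics:

# Coefficients, intercept, and training-set metrics
print(linRegModel.coefficients, linRegModel.intercept)
print(linRegModel.summary.rootMeanSquaredError)
print(linRegModel.summary.r2)
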
K-means is an unsupervised algorithm that partitions the rows into k clusters.

from pyspark.ml.clustering import KMeans

# Cluster the data into k=2 groups
kmeans = KMeans(
    featuresCol="scaledFeatures",
    k=2)
kmeansModel = kmeans.fit(df)
predictions = kmeansModel.transform(df)  # adds a "prediction" column with the cluster index
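Cluster quality can be checked with the silhouette score, and the learned centroids inspected directly:

from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette score; values near 1 indicate well-separated clusters
evaluator = ClusteringEvaluator(featuresCol="scaledFeatures")
silhouette = evaluator.evaluate(predictions)
print(silhouette)

# The learned cluster centers
for center in kmeansModel.clusterCenters():
    print(center)
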
PCA reduces dimensionality by projecting the features onto the top k principal components.

from pyspark.ml.feature import PCA

# Project the scaled features onto the top 2 principal components
pca = PCA(
    inputCol="scaledFeatures",
    outputCol="pcaFeatures",  # required: PCA has no default output column
    k=2)
pcaModel = pca.fit(df)
result = pcaModel.transform(df)
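The fitted model reports how much variance each component captures:

# Proportion of variance explained by each principal component
print(pcaModel.explainedVariance)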