%sh
curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
tar xzf flower_photos.tgz &>/dev/null
img_dir = '/tmp/flower_photos'
dbutils.fs.mkdirs(img_dir)
dbutils.fs.cp('file:/databricks/driver/flower_photos/tulips', img_dir + "/tulips", recurse=True)
dbutils.fs.cp('file:/databricks/driver/flower_photos/daisy', img_dir + "/daisy", recurse=True)
dbutils.fs.cp('file:/databricks/driver/flower_photos/LICENSE.txt', img_dir)
display(dbutils.fs.ls(img_dir))
from pyspark.ml.image import ImageSchema

# Read all images under img_dir into a DataFrame with a single "image" column.
image_df = ImageSchema.readImages(img_dir)
image_df.show()
+--------------------+
| image|
+--------------------+
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
|[dbfs:/tmp/flower...|
+--------------------+
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from sparkdl.image import imageIO

tulips_df = ImageSchema.readImages(img_dir + "/tulips").withColumn("label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(img_dir + "/daisy", decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# Use larger training sets (e.g. [0.6, 0.4]) on non-community-edition clusters.
tulips_train, tulips_test, _ = tulips_df.randomSplit([0.005, 0.005, 0.99])
daisy_train, daisy_test, _ = daisy_df.randomSplit([0.005, 0.005, 0.99])

train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

# Under the hood, each partition is fully loaded into memory, which may be expensive.
# Repartitioning ensures that each partition stays small.
train_df = train_df.repartition(100)
test_df = test_df.repartition(100)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)
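The fitted pipeline can then be scored on the held-out images. As a minimal sketch, assuming the train/test split created above, test-set accuracy can be computed with MLlib's MulticlassClassificationEvaluator:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Apply the fitted pipeline (InceptionV3 featurizer + logistic regression) to the test images.
tested_df = p_model.transform(test_df)

# Compare predicted labels against the true labels to get accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label"))))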
Deep Learning Pipelines for Apache Spark - Release 1.2.0
Deep Learning Pipelines is a library published by Databricks that provides high-level APIs for scalable deep learning model application and transfer learning, integrating popular deep learning libraries with MLlib Pipelines and Spark SQL. For an overview and the philosophy behind the library, see the Databricks blog post. This notebook parallels the Deep Learning Pipelines README, walking through its usage examples with additional tips for getting started with the library on Databricks.
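For the "model application" half of that description, the README's example of applying a pre-trained model at scale looks roughly like the sketch below; it reuses the image_df DataFrame loaded earlier and sparkdl's DeepImagePredictor transformer:

from sparkdl import DeepImagePredictor

# Score every image with a pre-trained InceptionV3 model and decode the top 10 ImageNet labels.
predictor = DeepImagePredictor(inputCol="image", outputCol="predicted_labels",
                               modelName="InceptionV3", decodePredictions=True, topK=10)
predictions_df = predictor.transform(image_df)
display(predictions_df.select("image", "predicted_labels"))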