Detect-Track-Classify-demo(Python)

Loading...

Detect-Track-Classify with Spark

Here we demo a feature similar to Amazon's X-Ray: identifying the actors who appear in a video segment, using Spark.

x-ray demo

import string
import random
import numpy as np
import face_recognition
from scipy.optimize import linear_sum_assignment
 
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import DataFrame, Row
from pyspark.sql.window import Window
 
from rikai.types.geometry import Box2d
from rikai.types import YouTubeVideo, Segment
from rikai.spark.types import YouTubeVideoType
from rikai.spark.types.geometry import Box2dType
from rikai.spark.functions import video_to_images

With Rikai, we can load YouTube videos into Spark DataFrames. We can then explode each video into a sample of frames using the video_to_images UDF.

# Sample DataFrame of YouTube clips: one row holding the video id, the
# YouTubeVideo handle, and the Segment to process.
df = spark.createDataFrame(
    [("nf8ySuesAPg", YouTubeVideo(vid="nf8ySuesAPg"), Segment(30, 120))],
    schema=["vid", "youtube_video", "segment"],
)

# video_to_images yields a collection of frames per video; explode it so that
# each frame becomes its own row, then drop the intermediate array column.
# The literal 5 is the third argument to video_to_images -- presumably the
# frame sampling step; confirm against the rikai documentation.
df = (
    df.withColumn(
        "video_frames",
        video_to_images(F.col("youtube_video"), F.col("segment"), F.lit(5)),
    )
    .withColumn("frame", F.explode(F.col("video_frames")))
    .drop("video_frames")
)

display(df)
 
vid
youtube_video
segment
frame
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_150.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_155.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_160.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_165.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_170.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_175.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_180.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_185.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_190.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_195.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_200.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_205.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_210.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_215.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_220.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_225.jpg')
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_230.jpg')

Showing all 90 rows.

Next, we use face_recognition to implement a UDF for face detection that returns a list of bounding boxes for each frame.

def bbox_helper(bbox):
  """Reorder a 4-element box and clamp negative coordinates to zero.

  The input is unpacked as (top, right, bottom, left). The result is the
  list [top, left, bottom, right] in which every value below 0 has been
  replaced by 0.

  NOTE(review): face_detector splats this list positionally into Box2d,
  whose repr lists xmin, ymin, xmax, ymax. If that is also the constructor
  order, top/left end up in xmin/ymin (x and y swapped). Confirm against
  the downstream consumers of these boxes before changing the order.
  """
  top, right, bottom, left = bbox
  return [max(coord, 0) for coord in (top, left, bottom, right)]
 
@udf(returnType=T.ArrayType(Box2dType()))
def face_detector(image):
  """Detect faces in one frame and return them as a list of Box2d.

  The frame is converted to a numpy array and passed to
  face_recognition.face_locations; each location found is normalised by
  bbox_helper and wrapped in a Box2d. The list is empty when
  face_locations reports nothing.
  """
  pixels = image.to_numpy()
  boxes = []
  for location in face_recognition.face_locations(pixels):
    boxes.append(Box2d(*bbox_helper(location)))
  return boxes
# Run the face detector on every frame; the boxes land in a new "faces" column.
df = df.withColumn("faces", face_detector(df["frame"]))
display(df)
 
vid
youtube_video
segment
frame
faces
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_150.jpg')
["Box2d(xmin=82.0, ymin=253.0, xmax=211.0, ymax=382.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_155.jpg')
["Box2d(xmin=82.0, ymin=253.0, xmax=211.0, ymax=382.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_160.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_165.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_170.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_175.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_180.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_185.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_190.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_195.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_200.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_205.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_210.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_215.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Segment(start_fno=30, end_fno=120)
Image(uri='nf8ySuesAPg_220.jpg')
["Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)"]

Showing all 90 rows.

We create an additional id column to match bounding boxes in time.

# Flatten to one row per detected face, tag each with a placeholder id, then
# regroup so every frame carries a list of {detections, detect_id} structs.
# NOTE: F.explode emits no row for an empty or null array, so frames without
# any detected face are absent from the result.
df = (
    df.select("vid", "youtube_video", "frame", "faces")
    .withColumn("detections", F.explode("faces"))
    # Empty-string placeholder; presumably filled in by the later matching step.
    .withColumn("detect_id", F.lit(""))
    .withColumn("annot", F.struct("detections", "detect_id"))
    .groupBy("vid", "youtube_video", "frame")
    .agg(F.collect_list("annot").alias("annots"))
)
display(df)
 
vid
youtube_video
frame
annots
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_345.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=253.0, xmax=211.0, ymax=382.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_160.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_480.jpg')
[{"detections": "Box2d(xmin=96.0, ymin=268.0, xmax=225.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_390.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_495.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_330.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=253.0, xmax=211.0, ymax=382.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_190.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_200.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_150.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=253.0, xmax=211.0, ymax=382.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_295.jpg')
[{"detections": "Box2d(xmin=96.0, ymin=253.0, xmax=225.0, ymax=382.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_455.jpg')
[{"detections": "Box2d(xmin=96.0, ymin=268.0, xmax=225.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_235.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_400.jpg')
[{"detections": "Box2d(xmin=96.0, ymin=282.0, xmax=225.0, ymax=411.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_410.jpg')
[{"detections": "Box2d(xmin=96.0, ymin=268.0, xmax=225.0, ymax=397.0)", "detect_id": ""}]
nf8ySuesAPg
YouTubeVideo(vid='nf8ySuesAPg')
Image(uri='nf8ySuesAPg_545.jpg')
[{"detections": "Box2d(xmin=82.0, ymin=268.0, xmax=211.0, ymax=397.0)", "detect_id": ""}]

Showing all 90 rows.

This UDF uses Rikai's Box2d.iou() method and scipy.optimize.linear_sum_assignment to match overlapping bounding boxes across frames, returning a mapping.

bbox matching