This post shows how to record video, detect faces, and overlay images in real time using AVFoundation with Swift 3:

1. Create your recorder view:

import UIKit
import AVFoundation
import ImageIO

final class RecorderView: UIView {
  //Your next steps will be implemented here :]
}

2. Create camera:

Let's create some instance variables for your camera inside your recorder view:

final class RecorderView: UIView {
  fileprivate lazy var cameraSession = AVCaptureSession()
  fileprivate lazy var videoDataOutput = AVCaptureVideoDataOutput()
  fileprivate lazy var audioDataOutput = AVCaptureAudioDataOutput()
  fileprivate var previewLayer: AVCaptureVideoPreviewLayer!
  //Set this flag to choose between the front and back camera
  fileprivate var isUsingFrontFacingCamera = true
}

Now let’s set up your camera:

final class RecorderView: UIView {
  //...
  
  fileprivate func setupCamera() {
      //The size of output video will be 720x1280
      cameraSession.sessionPreset = AVCaptureSessionPreset1280x720
      
      //Setup your camera
      //Detect which type of camera should be used via `isUsingFrontFacingCamera`
      let captureDevice: AVCaptureDevice
      if isUsingFrontFacingCamera {
          captureDevice = AVCaptureDevice.devices(withMediaType: AVMediaTypeVideo)
                                         .flatMap { $0 as? AVCaptureDevice }
                                         .first(where: { $0.position == .front }) ?? AVCaptureDevice.defaultDevice(withMediaType: AVMediaTypeVideo)
      } else {
          captureDevice = AVCaptureDevice.defaultDevice(withMediaType: AVMediaTypeVideo)
      }
      
      //Setup your microphone 
      let audioDevice = AVCaptureDevice.defaultDevice(withMediaType: AVMediaTypeAudio)
      
      do {
          cameraSession.beginConfiguration()
              
          // Add camera to your session
          let deviceInput = try AVCaptureDeviceInput(device: captureDevice)
          if cameraSession.canAddInput(deviceInput) {
              cameraSession.addInput(deviceInput)
          }
      
          // Add microphone to your session
          let audioInput = try AVCaptureDeviceInput(device: audioDevice)
          if cameraSession.canAddInput(audioInput) {
              cameraSession.addInput(audioInput)
          }
          
          //Now let's define the output data
          let queue = DispatchQueue(label: "com.hilaoinc.hilao.queue.record-video.data-output")
          
          //Define your video output
          videoDataOutput.videoSettings = [
              kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA,
          ]
          videoDataOutput.alwaysDiscardsLateVideoFrames = true
          if cameraSession.canAddOutput(videoDataOutput) {
              videoDataOutput.setSampleBufferDelegate(self, queue: queue)
              cameraSession.addOutput(videoDataOutput)
          }
          
          //Define your audio output
          if cameraSession.canAddOutput(audioDataOutput) {
              audioDataOutput.setSampleBufferDelegate(self, queue: queue)
              cameraSession.addOutput(audioDataOutput)
          }
          
          cameraSession.commitConfiguration()
          
          //Present the video preview
          previewLayer = AVCaptureVideoPreviewLayer(session: cameraSession)
          previewLayer.frame = bounds
          previewLayer.videoGravity = AVLayerVideoGravityResizeAspectFill
          layer.addSublayer(previewLayer)
          
          //Don't forget to start running your session
          //This doesn't mean the recording has started!
          cameraSession.startRunning()
      }
      catch let error {
          debugPrint(error.localizedDescription)
      }
  }
}
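One thing setupCamera() assumes is that the app already has camera and microphone permission. Here's a minimal sketch (the requestPermissionsThenSetup helper is my own, not part of the original project) that asks for both before starting the session. Don't forget to add the NSCameraUsageDescription and NSMicrophoneUsageDescription keys to your Info.plist, otherwise the app will crash on iOS 10:

final class RecorderView: UIView {
  //...
  
  //A minimal sketch (not part of the original post): request camera and
  //microphone access, then set up the session on the main queue.
  fileprivate func requestPermissionsThenSetup() {
      AVCaptureDevice.requestAccess(forMediaType: AVMediaTypeVideo) { [weak self] videoGranted in
          AVCaptureDevice.requestAccess(forMediaType: AVMediaTypeAudio) { audioGranted in
              guard videoGranted && audioGranted else { return }
              DispatchQueue.main.async {
                  self?.setupCamera()
              }
          }
      }
  }
}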

Let’s implement AVCaptureVideoDataOutputSampleBufferDelegate and AVCaptureAudioDataOutputSampleBufferDelegate to handle the captured sample buffers.

// MARK: - AVCaptureVideoDataOutputSampleBufferDelegate, AVCaptureAudioDataOutputSampleBufferDelegate

extension RecorderView: AVCaptureVideoDataOutputSampleBufferDelegate,
                      AVCaptureAudioDataOutputSampleBufferDelegate {
  //Both delegates share the same callback method
  func captureOutput(_ captureOutput: AVCaptureOutput!,
                       didOutputSampleBuffer sampleBuffer: CMSampleBuffer!,
                       from connection: AVCaptureConnection!) {
      //Face detection and the video overlay will happen here, take care my man :]
  }
}

3. Detect faces in real time:

We’ll use CIDetector to detect faces in a CIImage. Let's create one shared detector:

final class RecorderView: UIView {
  //...
  fileprivate lazy var faceDetector = CIDetector(ofType: CIDetectorTypeFace,
                                                 context: nil,
                                                 options: [
                                                      CIDetectorAccuracy: CIDetectorAccuracyHigh,
                                                      CIDetectorTracking: true
                                                 ])!
  //...
}
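If you want to sanity-check the detector before wiring it into the real-time frames, here is a small sketch that runs it on a still image. This helper is not part of the recording flow, and the `sample-photo` asset name is an assumption:

extension RecorderView {
  //Sanity check only: run the face detector on a bundled image named `sample-photo`
  func detectFacesInStillImage() {
      guard let uiImage = UIImage(named: "sample-photo"),
            let ciImage = CIImage(image: uiImage) else { return }
      let faces = faceDetector.features(in: ciImage, options: [CIDetectorSmile: true])
                              .flatMap { $0 as? CIFaceFeature }
      faces.forEach { debugPrint("Found face at \($0.bounds), smiling: \($0.hasSmile)") }
  }
}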

Now back to the method func captureOutput(_:didOutputSampleBuffer:from:):

func captureOutput(_ captureOutput: AVCaptureOutput!,
                   didOutputSampleBuffer sampleBuffer: CMSampleBuffer!,
                   from connection: AVCaptureConnection!) {
  //The overlay applies to the video frames, not to the audio
  if captureOutput == videoDataOutput {
      //Important: Correct your video orientation from your device orientation
      switch UIDevice.current.orientation {
          case .landscapeRight:
              connection.videoOrientation = .landscapeLeft
          case .landscapeLeft:
              connection.videoOrientation = .landscapeRight
          case .portrait:
              connection.videoOrientation = .portrait
          case .portraitUpsideDown:
              connection.videoOrientation = .portraitUpsideDown
          default:
              connection.videoOrientation = .portrait //Default to `.portrait` (you may also want to handle `.faceUp` and `.faceDown`)
      }
      
      //Convert current frame to `CIImage`
      let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)!
      let attachments = CMCopyDictionaryOfAttachments(kCFAllocatorDefault,
                                                      pixelBuffer,
                                                      CMAttachmentMode(kCMAttachmentMode_ShouldPropagate)) as? [String: Any]
      let ciImage = CIImage(cvImageBuffer: pixelBuffer, options: attachments)
      
      //Detect faces in the `ciImage`
      let features = faceDetector.features(in: ciImage, options: [
                CIDetectorSmile: true,
                CIDetectorEyeBlink: true,
                ]).flatMap ({ $0 as? CIFaceFeature })

      //Retrieve the frame of your buffer
      let desc = CMSampleBufferGetFormatDescription(sampleBuffer)!
      let bufferFrame = CMVideoFormatDescriptionGetCleanAperture(desc, false)

      //Draw faces masks
      DispatchQueue.main.async { [weak self] in
          self?.drawFaceMasksFor(features: features, bufferFrame: bufferFrame)
      }
  }
}

Now draw the face masks on the screen:

extension RecorderView {
  fileprivate func drawFaceMasksFor(features: [CIFaceFeature], bufferFrame: CGRect) {
        CATransaction.begin()
        CATransaction.setValue(kCFBooleanTrue, forKey: kCATransactionDisableActions)

        //Hide all current masks
        layer.sublayers?.filter({ $0.name == "MaskFace" }).forEach { $0.isHidden = true }

        //Do nothing if no face is detected
        guard !features.isEmpty else {
            CATransaction.commit()
            return
        }

        //We detect the faces at the video's image size,
        //but the on-screen view might be smaller or bigger than the video,
        //so we need to re-calculate the face bounds to fit the screen

        let xScale = frame.width / bufferFrame.width
        let yScale = frame.height / bufferFrame.height
        let transform = CGAffineTransform(rotationAngle: .pi).translatedBy(x: -bufferFrame.width,
                                                                         y: -bufferFrame.height)

      for feature in features {
          var faceRect = feature.bounds.applying(transform)
          faceRect = CGRect(x: faceRect.minX * xScale,
                            y: faceRect.minY * yScale,
                            width: faceRect.width * xScale,
                            height: faceRect.height * yScale)
          
          //Reuse a hidden face layer if one is available
          var faceLayer = layer.sublayers?
                               .filter { $0.name == "MaskFace" && $0.isHidden == true }
                               .first
          if faceLayer == nil {
              //Add an image to your project with the name `face-image`
              let faceImage = UIImage(named: "face-image")
              faceLayer = CALayer()
              faceLayer?.name = "MaskFace"
              faceLayer?.contents = faceImage?.cgImage
              faceLayer?.frame = faceRect
              faceLayer?.masksToBounds = true
              faceLayer?.contentsGravity = kCAGravityResizeAspectFill
              layer.addSublayer(faceLayer!)
          } else {
              faceLayer?.frame = faceRect
              faceLayer?.isHidden = false
          }
          
          //You can add some masks for your left eye, right eye, mouth 
      }
      CATransaction.commit()
  }
}

Now run your project and check the result. Don’t forget to add your RecorderView to your view controller, as sketched below.
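Here's a minimal sketch of that wiring. The initializer overrides and the CameraViewController name are my own assumptions; the post itself doesn't show where setupCamera() gets called:

final class RecorderView: UIView {
  //...
  
  override init(frame: CGRect) {
      super.init(frame: frame)
      setupCamera()
  }
  
  required init?(coder aDecoder: NSCoder) {
      super.init(coder: aDecoder)
      setupCamera()
  }
}

final class CameraViewController: UIViewController {
  fileprivate var recorderView: RecorderView!
  
  override func viewDidLoad() {
      super.viewDidLoad()
      recorderView = RecorderView(frame: view.bounds)
      view.addSubview(recorderView)
  }
}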

4. Record video

In the previous steps we already detect faces in real time; now we will record a video.

To write the buffers out as a video file we need AVAssetWriter:

//...
fileprivate(set) var isRecording = false
fileprivate var videoWriter: AVAssetWriter!
fileprivate var videoWriterInput: AVAssetWriterInput!
fileprivate var audioWriterInput: AVAssetWriterInput!
fileprivate var sessionAtSourceTime: CMTime?
//...

Set up the asset writer. Whenever you want to record a new video, you have to set up the writer again!

fileprivate func setupWriter() {
  do {
      let url = AssetUtils.outputAssetURL(mediaType: .video)
      videoWriter = try AVAssetWriter(url: url, fileType: AVFileTypeMPEG4)
      
      //Add video input
      videoWriterInput = AVAssetWriterInput(mediaType: AVMediaTypeVideo, outputSettings: [
              AVVideoCodecKey: AVVideoCodecH264,
              AVVideoWidthKey: 720,
              AVVideoHeightKey: 1280,
              AVVideoCompressionPropertiesKey: [
                  AVVideoAverageBitRateKey: 2300000,
              ],
          ])
      videoWriterInput.expectsMediaDataInRealTime = true //Make sure we are exporting data at realtime
      if videoWriter.canAdd(videoWriterInput) {
          videoWriter.add(videoWriterInput)
      }
      
      //Add audio input 
      audioWriterInput = AVAssetWriterInput(mediaType: AVMediaTypeAudio, outputSettings: [
              AVFormatIDKey: kAudioFormatMPEG4AAC,
              AVNumberOfChannelsKey: 1,
              AVSampleRateKey: 44100,
              AVEncoderBitRateKey: 64000,
          ])
      audioWriterInput.expectsMediaDataInRealTime = true
      if videoWriter.canAdd(audioWriterInput) {
          videoWriter.add(audioWriterInput)
      }
      
      videoWriter.startWriting() //Means ready to write down the file
  }
  catch let error {
      debugPrint(error.localizedDescription)
  }
}
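The setup above calls AssetUtils.outputAssetURL(mediaType:), which the post never shows. Here is one possible implementation, purely an assumption on my side: it just builds a unique .mp4 URL inside the temporary directory:

enum AssetUtils {
  enum MediaType: String {
      case video = "mp4"
  }
  
  //Hypothetical helper: returns a unique file URL in the temporary directory
  static func outputAssetURL(mediaType: MediaType) -> URL {
      let fileName = "\(UUID().uuidString).\(mediaType.rawValue)"
      return URL(fileURLWithPath: NSTemporaryDirectory()).appendingPathComponent(fileName)
  }
}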

Check whether the writer is ready to write:

extension RecorderView {
  fileprivate func canWrite() -> Bool {
      return isRecording
          && videoWriter != nil
          && videoWriter.status == .writing
  }
}

To write the buffers to the writer, come back to func captureOutput(_:didOutputSampleBuffer:from:) and make some updates:

func captureOutput(_ captureOutput: AVCaptureOutput!,
                   didOutputSampleBuffer sampleBuffer: CMSampleBuffer!,
                   from connection: AVCaptureConnection!) {
  guard captureOutput != nil,
            sampleBuffer != nil,
            connection != nil,
            CMSampleBufferDataIsReady(sampleBuffer) else { return }
  
  let writable = canWrite()
  
  if writable,
       sessionAtSourceTime == nil {
      //Start writing
      sessionAtSourceTime = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
      videoWriter.startSession(atSourceTime: sessionAtSourceTime!)
    }

  if captureOutput == videoDataOutput {
      ... //Your existing overlay code from the previous step goes here
      
      if writable,
          videoWriterInput.isReadyForMoreMediaData {
          //Write the video buffer
          videoWriterInput.append(sampleBuffer)
      }
  } else if writable,
      captureOutput == audioDataOutput,
      audioWriterInput.isReadyForMoreMediaData {
      //Write audio buffer
      audioWriterInput.append(sampleBuffer)
  }
}

To start recording:

extension RecorderView {
  func start() {
      guard !isRecording else { return }
      isRecording = true
      sessionAtSourceTime = nil
      setupWriter()
  }
}

Stop recording:

extension RecorderView {
  func stop() {
      guard isRecording else { return }
      isRecording = false
      videoWriter.finishWriting { [weak self] in
          self?.sessionAtSourceTime = nil
          guard let url = self?.videoWriter.outputURL else { return }
          let asset = AVURLAsset(url: url)
          //Do whatever you want with your asset here
      }
  }
}
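As one example of "whatever you want", you could save the finished file to the photo library with the Photos framework. This is just a hedged sketch, not part of the original post; it assumes you import Photos and add NSPhotoLibraryUsageDescription to your Info.plist:

import Photos

extension RecorderView {
  //Example only: save the finished recording to the user's photo library
  func saveToPhotoLibrary(url: URL) {
      PHPhotoLibrary.shared().performChanges({
          _ = PHAssetChangeRequest.creationRequestForAssetFromVideo(atFileURL: url)
      }, completionHandler: { success, error in
          debugPrint("Saved to photo library: \(success), error: \(String(describing: error))")
      })
  }
}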

You can also pause and resume the recording like this:

extension RecorderView {
  func pause() {
      isRecording = false
  }
  
  func resume() {
      isRecording = true
  }
}

Don’t be mad if the recorded video comes out without the mask :] I'll show you how to fix that in the next step!

5. Add the mask to the recorded video

To bake the mask into the recorded video we need to use AVAssetWriterInputPixelBufferAdaptor :]

Declare some more instance variables:

fileprivate var videoWriterInputPixelBufferAdaptor: AVAssetWriterInputPixelBufferAdaptor!
fileprivate lazy var sDeviceRgbColorSpace = CGColorSpaceCreateDeviceRGB()
fileprivate lazy var bitmapInfo = CGBitmapInfo.byteOrder32Little
                                            .union(CGBitmapInfo(rawValue: CGImageAlphaInfo.noneSkipFirst.rawValue))

Update func setupWriter():

fileprivate func setupWriter() {
  do {
      //...
      
      videoWriterInputPixelBufferAdaptor = AVAssetWriterInputPixelBufferAdaptor(assetWriterInput: videoWriterInput, sourcePixelBufferAttributes: [
              kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA,
              kCVPixelBufferWidthKey as String: Constant.Configuration.DefaultAssetSize.width,
              kCVPixelBufferHeightKey as String: Constant.Configuration.DefaultAssetSize.height,
              kCVPixelBufferOpenGLESCompatibilityKey as String: true,
            ])

      videoWriter.startWriting()
  }
  catch let error {
      debugPrint(error.localizedDescription)
  }
}
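Note that the adaptor attributes reference Constant.Configuration.DefaultAssetSize, which is never defined in the post. Here is a minimal sketch of what it could be, assuming it simply mirrors the 720x1280 session preset from step 2:

enum Constant {
  enum Configuration {
      //Assumed to match AVCaptureSessionPreset1280x720 used in `setupCamera()`
      static let DefaultAssetSize = CGSize(width: 720, height: 1280)
  }
}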

Once again, back to func captureOutput(_:didOutputSampleBuffer:from:):

func captureOutput(_ captureOutput: AVCaptureOutput!,
                   didOutputSampleBuffer sampleBuffer: CMSampleBuffer!,
                   from connection: AVCaptureConnection!) {
  //...

  if captureOutput == videoDataOutput {
      //...
      
      //We no longer write directly to `videoWriterInput`; we write to `videoWriterInputPixelBufferAdaptor` instead
      //if videoWriterInput.isReadyForMoreMediaData { 
      // //Write video buffer
      //  videoWriterInput.append(sampleBuffer)
      //}
      if writable {
          autoreleasepool { //Make sure the `CVPixelBuffer` is released after use
              //Lock `pixelBuffer` before working on it
              CVPixelBufferLockBaseAddress(pixelBuffer, CVPixelBufferLockFlags(rawValue: 0))
              
              //Deep copy the pixel buffer to avoid a memory leak
              var renderedOutputPixelBuffer: CVPixelBuffer? = nil
              let options = [
                  kCVPixelBufferCGImageCompatibilityKey as String: true,
                  kCVPixelBufferCGBitmapContextCompatibilityKey as String: true,
              ] as CFDictionary
              let status = CVPixelBufferCreate(kCFAllocatorDefault,
                                               CVPixelBufferGetWidth(pixelBuffer),
                                               CVPixelBufferGetHeight(pixelBuffer),
                                               kCVPixelFormatType_32BGRA, options,
                                               &renderedOutputPixelBuffer)
              guard status == kCVReturnSuccess else { return }
              
              //Lock the copy of the pixel buffer while working on it
              CVPixelBufferLockBaseAddress(renderedOutputPixelBuffer!,
                                           CVPixelBufferLockFlags(rawValue: 0))
              
              let renderedOutputPixelBufferBaseAddress = CVPixelBufferGetBaseAddress(renderedOutputPixelBuffer!)
              
              memcpy(renderedOutputPixelBufferBaseAddress,
                     CVPixelBufferGetBaseAddress(pixelBuffer),
                     CVPixelBufferGetHeight(pixelBuffer) * CVPixelBufferGetBytesPerRow(pixelBuffer))
              
              if !features.isEmpty {
                  //Create a context based on the copied buffer
                  let context = CGContext(data: renderedOutputPixelBufferBaseAddress,
                                          width: CVPixelBufferGetWidth(renderedOutputPixelBuffer!),
                                          height: CVPixelBufferGetHeight(renderedOutputPixelBuffer!),
                                          bitsPerComponent: 8,
                                          bytesPerRow: CVPixelBufferGetBytesPerRow(renderedOutputPixelBuffer!),
                                          space: sDeviceRgbColorSpace,
                                          bitmapInfo: bitmapInfo.rawValue)
                  
                  for feature in features {
                      //Draw the mask image onto the copied buffer
                      let faceImage = UIImage(named: "face-image")!
                      context?.draw(faceImage.cgImage!, in: feature.bounds)
                  }
              }
              
              //Make sure the adaptor and the writer are able to write
              let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
              if videoWriterInputPixelBufferAdaptor.assetWriterInput.isReadyForMoreMediaData,
                  canWrite() {
                  //Write to the adaptor instead of `videoWriterInput`
                  videoWriterInputPixelBufferAdaptor.append(renderedOutputPixelBuffer!, withPresentationTime: timestamp)
              }
              
              //Unlock the buffers after processing them
              CVPixelBufferUnlockBaseAddress(renderedOutputPixelBuffer!,
                                             CVPixelBufferLockFlags(rawValue: 0))
              CVPixelBufferUnlockBaseAddress(pixelBuffer,
                                             CVPixelBufferLockFlags(rawValue: 0))
          }
      }
  } else if writable,
      captureOutput == audioDataOutput,
      audioWriterInput.isReadyForMoreMediaData {
      //Write audio buffer
      audioWriterInput.append(sampleBuffer)
  }
}