Use ML Kit for Firebase with SwiftUI

I had a look at ML Kit for Firebase recently and found this tutorial:
https://www.raywenderlich.com/6565-ml-kit-tutorial-for-ios-recognizing-text-in-images

I wonder if anyone has tried to replicate it with SwiftUI, or if there are any sample projects out there.
I found some for Apple's Vision framework, but the ones using Firebase were all built with UIKit.

This is the solution I tried, but it did not work. I believe the size of the UIImage I get via image.size does not match the size of the image that Firebase uses. But maybe someone else has an idea how to solve this?

The result:

(screenshot: IMG_1693.PNG)

This is the view that does the calculation:

struct ImageScanned: View {
    var image: UIImage
    @Binding var rectangles: [DetectedRectangle]
    @State var viewSize: CGSize = .zero

    var body: some View {
        // TODO: fix scaling
        ZStack {
            Image(uiImage: image)
                .resizable()
                .aspectRatio(image.size, contentMode: .fill)
                .overlay(
                    GeometryReader { geometry in
                        ZStack {
                            ForEach(self.transformRectangles(geometry: geometry)) { rect in
                                DetectedRectangleView(rectangle: rect)
                            }
                        }
                    }
                )
        }
    }

    private func transformRectangles(geometry: GeometryProxy) -> [DetectedRectangle] {
        var rectangles: [DetectedRectangle] = []

        let viewSize = geometry.frame(in: .global).size

        // 2: aspect ratios of the view and of the image
        let resolutionView = viewSize.width / viewSize.height
        let resolutionImage = self.image.size.width / self.image.size.height

        // 3: scale factor along the axis that limits the fitted image
        var scale: CGFloat
        if resolutionView > resolutionImage {
          scale = viewSize.height / self.image.size.height
        } else {
          scale = viewSize.width / self.image.size.width
        }

        for rect in self.rectangles {

            // 4: scale the detected feature's size
            let featureWidthScaled = rect.width * scale
            let featureHeightScaled = rect.height * scale

            // 5: offset of the image inside the view (computed here from the feature size)
            let imageWidthScaled = rect.width * scale
            let imageHeightScaled = rect.height * scale
            let imagePointXScaled = (viewSize.width - imageWidthScaled) / 2
            let imagePointYScaled = (viewSize.height - imageHeightScaled) / 2

            // 6: translate the feature's origin into view coordinates
            let featurePointXScaled = imagePointXScaled + rect.x * scale
            let featurePointYScaled = imagePointYScaled + rect.y * scale

            rectangles.append(DetectedRectangle(width: featureWidthScaled,
                                                height: featureHeightScaled,
                                                x: featurePointXScaled,
                                                y: featurePointYScaled))
        }
        return rectangles
    }
}
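
If I read the tutorial's scaling helper correctly, the offset in step 5 is derived from the scaled size of the whole image, not from the scaled feature size. A sketch of what that change would look like here (untested; it reuses the DetectedRectangle type defined below):

// Sketch (untested): same math as above, but step 5 uses the scaled size of
// the whole image so the offset centers the image inside the view.
private func transformRectangle(_ rect: DetectedRectangle,
                                imageSize: CGSize,
                                viewSize: CGSize) -> DetectedRectangle {
    let resolutionView = viewSize.width / viewSize.height
    let resolutionImage = imageSize.width / imageSize.height

    // Scale along the axis that limits the fitted image.
    let scale = resolutionView > resolutionImage
        ? viewSize.height / imageSize.height
        : viewSize.width / imageSize.width

    // Scaled size of the whole image and the offset that centers it in the view.
    let imageWidthScaled = imageSize.width * scale
    let imageHeightScaled = imageSize.height * scale
    let imagePointXScaled = (viewSize.width - imageWidthScaled) / 2
    let imagePointYScaled = (viewSize.height - imageHeightScaled) / 2

    // Scale and translate the detected feature into view coordinates.
    return DetectedRectangle(width: rect.width * scale,
                             height: rect.height * scale,
                             x: imagePointXScaled + rect.x * scale,
                             y: imagePointYScaled + rect.y * scale,
                             text: rect.text)
}
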
This is the view that draws the rectangles:
struct DetectedRectangleView: View {
    var rectangle: DetectedRectangle

    var body: some View {
        Rectangle()
            .path(in: CGRect(
                x: rectangle.x,
                y: rectangle.y,
                width: rectangle.width,
                height: rectangle.height
            ))
            .foregroundColor(Color.white)
            .opacity(0.7)
    }
}

struct DetectedRectangle: Identifiable {
    var id = UUID()
    var width: CGFloat = 0
    var height: CGFloat = 0
    var x: CGFloat = 0
    var y: CGFloat = 0
    var text: String = ""
}

The view which contains the above views:
struct StartScanView: View {
    @State var showCaptureImageView: Bool = false
    @State var image: UIImage? = nil
    @State var rectangles: [DetectedRectangle] = []

    var body: some View {
        ZStack {
            if showCaptureImageView {
                CaptureImageView(isShown: $showCaptureImageView, image: $image)
            } else {
                VStack {
                    Button(action: {
                        self.showCaptureImageView.toggle()
                    }) {
                        Text("Start Scanning")
                    }

                    // show here View with rectangles on top of image
                    if self.image != nil {
                        ImageScanned(image: self.image ?? UIImage(), rectangles: $rectangles)
                    }

                    Button(action: {
                        self.processImage()
                    }) {
                        Text("Process Image")
                    }
                }
            }
        }
    }

    func processImage() {
        let scaledImageProcessor = ScaledElementProcessor()
        if image != nil {
            scaledImageProcessor.process(in: image!) { text in
                for block in text.blocks {
                    for line in block.lines {
                        for element in line.elements {
                            let frame = element.frame
                            self.rectangles.append(DetectedRectangle(width: frame.width, height: frame.height, x: frame.minX, y: frame.minY, text: element.text))
                        }
                    }
                }
            }
        }
    }
}
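
One side note on processImage: the rectangles array is only ever appended to, so repeated taps on "Process Image" keep accumulating results. A slightly tidied sketch of the same function (inside StartScanView; behaviour is otherwise unchanged):

// Sketch: unwrap the optional image once and reset old results before appending.
func processImage() {
    guard let image = image else { return }
    rectangles = []

    let scaledImageProcessor = ScaledElementProcessor()
    scaledImageProcessor.process(in: image) { text in
        var detected: [DetectedRectangle] = []
        for block in text.blocks {
            for line in block.lines {
                for element in line.elements {
                    // element.frame is, as far as I can tell, in the coordinate
                    // space of the original image, so it still needs the
                    // view-space transform from ImageScanned.
                    let frame = element.frame
                    detected.append(DetectedRectangle(width: frame.width,
                                                      height: frame.height,
                                                      x: frame.minX,
                                                      y: frame.minY,
                                                      text: element.text))
                }
            }
        }
        self.rectangles = detected
    }
}
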

Lastly, the views that handle the image capture (included so that the example is complete):

CaptureImageView
struct CaptureImageView {
    @Binding var isShown: Bool
    @Binding var image: UIImage?

    func makeCoordinator() -> Coordinator {
        return Coordinator(isShown: $isShown, image: $image)
    }
}

extension CaptureImageView: UIViewControllerRepresentable {
    func makeUIViewController(context: UIViewControllerRepresentableContext<CaptureImageView>) -> UIImagePickerController {
        let picker = UIImagePickerController()
        picker.sourceType = .camera
        picker.delegate = context.coordinator
        return picker
    }

    func updateUIViewController(_: UIImagePickerController,
                                context _: UIViewControllerRepresentableContext<CaptureImageView>) {}
}
Coordinator (for dealing with SwiftUI & UIKit images)
class Coordinator: NSObject, UINavigationControllerDelegate, UIImagePickerControllerDelegate {
    @Binding var isCoordinatorShown: Bool
    @Binding var imageInCoordinator: UIImage?
    init(isShown: Binding<Bool>, image: Binding<UIImage?>) {
        _isCoordinatorShown = isShown
        _imageInCoordinator = image
    }

    func imagePickerController(_: UIImagePickerController,
                               didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any]) {
        guard let unwrapImage = info[UIImagePickerController.InfoKey.originalImage] as? UIImage else { return }
        imageInCoordinator = unwrapImage
        isCoordinatorShown = false
    }

    func imagePickerControllerDidCancel(_: UIImagePickerController) {
        isCoordinatorShown = false
    }
}
Firebase Vision Text Recognizer
class ScaledElementProcessor {
    let vision = Vision.vision()
    var textRecognizer: VisionTextRecognizer!

    init() {
        // Provide language hints
        let options = VisionCloudTextRecognizerOptions()
        options.languageHints = ["nl"]
        textRecognizer = vision.cloudTextRecognizer(options: options)
    }

    func process(in image: UIImage?,
                 callback: @escaping (_ text: VisionText) -> Void) {
        guard let image = image else { return }
        let visionImage = VisionImage(image: image)

        // provide metadata to improve text recognition
        let metadata = VisionImageMetadata()
        metadata.orientation = .topLeft
        visionImage.metadata = metadata

        textRecognizer.process(visionImage) { result, error in
            guard
                error == nil,
                let result = result,
                !result.text.isEmpty
            else {
                return
            }
            callback(result)
        }
    }
}
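
One thing I am not sure about is the hard-coded metadata.orientation = .topLeft: photos coming from UIImagePickerController often have an imageOrientation other than .up, and .topLeft only matches an up-oriented image. A sketch of how I would map the orientation instead (the helper name is mine, and the mapping is my understanding of how UIImage.Orientation corresponds to the EXIF-style cases of VisionDetectorImageOrientation):

// Sketch: derive the Vision orientation from the UIImage instead of always
// passing .topLeft. The mapping follows the EXIF orientation values.
func visionOrientation(from orientation: UIImage.Orientation) -> VisionDetectorImageOrientation {
    switch orientation {
    case .up:            return .topLeft
    case .upMirrored:    return .topRight
    case .down:          return .bottomRight
    case .downMirrored:  return .bottomLeft
    case .leftMirrored:  return .leftTop
    case .right:         return .rightTop
    case .rightMirrored: return .rightBottom
    case .left:          return .leftBottom
    @unknown default:    return .topLeft
    }
}

// In process(in:callback:) this would replace the hard-coded value:
// metadata.orientation = visionOrientation(from: image.imageOrientation)
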

I already tried playing around with some calculations, like the one from the Firebase example project (also UIKit), but the results were all the same: the images were all too small. I hope someone has an idea.

@eszter Thank you for sharing your solution - much appreciated!
