# How to Build a Web Document Scanner Using OpenCV-Python

Many excellent mobile document-scanning apps support not only image capture but also edge detection and perspective transformation. If you are interested in these computer vision techniques, you can use OpenCV to build a free document scanner app yourself. In this post, I will show, step by step, how to use OpenCV-Python to create a web document scanner.

## Setting Up Environment

Install Flask:

`pip3 install flask`

Install OpenCV 3.3.0 for Python:

`pip3 install opencv-python`

If you need to build NumPy from source, run the following command in the NumPy source directory:

`python3 setup.py build install`

To compile NumPy source code on Windows 10, install Microsoft Visual C++ Compiler for Python 2.7.

## Web Document Scanner

### Steps of Building the App

Create document.py to do edge detection and perspective transformation:

```python
import cv2
import rect
import numpy as np

class Scanner(object):
    """Find a four-cornered outline in an image and flatten it to a top-down view."""

    # http://www.pyimagesearch.com/2014/08/25/4-point-opencv-getperspective-transform-example/
    def four_point_transform(self, image, rect):
        """Warp the quadrilateral ``rect`` of ``image`` onto an upright rectangle.

        Args:
            image: Source image handed to ``cv2.warpPerspective``.
            rect: Four (x, y) corner points ordered top-left, top-right,
                bottom-right, bottom-left. Note that this parameter shadows
                the ``rect`` module inside this method.

        Returns:
            The warped image; its width and height are the longer of each
            pair of opposite edges of the quadrilateral.
        """
        # cv2.getPerspectiveTransform expects float32 points; coercing here
        # also flattens a (4, 1, 2) contour-style array to (4, 2).
        src = np.asarray(rect, dtype="float32").reshape(4, 2)
        (tl, tr, br, bl) = src

        # compute the width of the new image, which will be the
        # maximum distance between bottom-right and bottom-left
        # points or the top-right and top-left points
        widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
        widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
        maxWidth = max(int(widthA), int(widthB))

        # compute the height of the new image, which will be the
        # maximum distance between the top-right and bottom-right
        # points or the top-left and bottom-left points
        heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
        heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
        maxHeight = max(int(heightA), int(heightB))

        # destination corners of the "birds eye view" (top-down) image,
        # in the same top-left, top-right, bottom-right, bottom-left order
        dst = np.array([
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1]], dtype="float32")

        # compute the perspective transform matrix and then apply it
        M = cv2.getPerspectiveTransform(src, dst)
        warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))

        # return the warped image
        return warped

    # https://github.com/vipul-sharma20/document-scanner
    def detect_edge(self, image, enabled_transform = False):
        """Outline the largest four-cornered contour and optionally flatten it.

        Args:
            image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
                The detected outline is drawn onto this array in place.
            enabled_transform: When true, also perspective-correct the
                outlined region from an untouched copy of the input.

        Returns:
            A tuple ``(image, dst)``: ``image`` is the input with the outline
            drawn on it (unchanged if no four-cornered contour was found);
            ``dst`` is the warped region, or ``None`` when the transform was
            not requested or no such contour was found.
        """
        dst = None
        # Keep a clean copy so the green outline never appears in the warp.
        orig = image.copy()

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(blurred, 0, 20)

        # findContours returns (image, contours, hierarchy) in OpenCV 3.x but
        # (contours, hierarchy) in 2.x and 4.x; the contour list is always
        # the second-to-last item.
        contours = cv2.findContours(edged, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]

        # Largest area first, so the first quadrilateral found is the biggest.
        contours = sorted(contours, key=cv2.contourArea, reverse=True)

        for cnt in contours:
            epsilon = 0.051 * cv2.arcLength(cnt, True)
            approx = cv2.approxPolyDP(cnt, epsilon, True)

            if len(approx) != 4:
                continue

            cv2.drawContours(image, [approx], -1, (0, 255, 0), 2)

            if enabled_transform:
                # rect.rectify presumably puts the corners into the
                # tl, tr, br, bl order four_point_transform expects -
                # TODO confirm against rect.py
                dst = self.four_point_transform(orig, rect.rectify(approx))
            break

        return image, dst
```

Create camera.py to capture frames from a camera:

```python
import cv2
from document import Scanner

class VideoCamera(object):
    """Read frames from a camera and run them through the document Scanner."""

    def __init__(self, device_index=2):
        """Open the capture device and reset all cached state.

        Args:
            device_index: Index passed to ``cv2.VideoCapture``. Defaults to 2
                as before; use 0 when only one camera is connected.
        """
        # Open a camera
        self.cap = cv2.VideoCapture(device_index)

        # Initialize video recording environment
        self.is_record = False
        self.out = None
        # JPEG bytes of the last perspective-corrected capture, or None
        self.transformed_frame = None

        self.scanner = Scanner()
        # JPEG bytes of the last successfully streamed frame, or None
        self.cached_frame = None

    def __del__(self):
        # __init__ may have failed before self.cap was assigned.
        cap = getattr(self, 'cap', None)
        if cap is not None:
            cap.release()

    def get_video_frame(self):
        """Grab one frame, outline the detected document, and JPEG-encode it.

        Returns:
            The encoded frame as bytes, or ``None`` when the camera read or
            the JPEG encoding fails.
        """
        ret, frame = self.cap.read()
        if not ret:
            return None

        frame, _ = self.scanner.detect_edge(frame)
        ret, jpeg = cv2.imencode('.jpg', frame)
        if not ret:
            return None

        data = jpeg.tobytes()
        # Cache the encoded bytes (not the raw array) so the cached frame can
        # be written straight into the multipart stream as a fallback.
        self.cached_frame = data
        return data

    def capture_frame(self):
        """Grab one frame and store its perspective-corrected JPEG.

        The result is kept in ``self.transformed_frame``; the previous value
        is left untouched when the read fails or no document is detected.
        Always returns ``None``.
        """
        ret, frame = self.cap.read()
        if not ret:
            return None

        _, warped = self.scanner.detect_edge(frame, True)
        if warped is None:
            # No four-cornered contour found; nothing to encode.
            return None

        ret, jpeg = cv2.imencode('.jpg', warped)
        if ret:
            self.transformed_frame = jpeg.tobytes()
        return None

    def get_cached_frame(self):
        """Return the JPEG bytes of the last streamed frame, or ``None``."""
        return self.cached_frame

    def get_image_frame(self):
        """Return the JPEG bytes of the last captured document, or ``None``."""
        return self.transformed_frame

```

Note: if you have only one device connected, the parameter in cv2.VideoCapture() should be 0.

Create server.py to stream camera frames to your web client:

```python
from flask import Flask, render_template, Response, jsonify, request
from camera import VideoCamera

# Flask application object; the @app.route decorators in this file register
# their views on it, so it must exist before the first decorator runs.
app = Flask(__name__)

# Shared VideoCamera instance, created lazily by the first view that needs it.
video_camera = None

@app.route('/')
def index():
    """Render the index.html template for the site root."""
    page = render_template('index.html')
    return page

@app.route('/capture_status', methods=['POST'])
def capture_status():
    """Capture a document frame when the posted JSON has ``status == "true"``.

    Expects a JSON object body such as ``{"status": "true"}``. Any other
    status, or a body that is not a JSON object, triggers no capture.

    Returns:
        A JSON response ``{"result": "done"}``.
    """
    global video_camera

    if video_camera is None:
        video_camera = VideoCamera()

    # get_json() can return None (e.g. an empty/JSON-null body) or a
    # non-dict value; treat both as "no status given" instead of crashing.
    payload = request.get_json()
    status = payload.get('status') if isinstance(payload, dict) else None

    if status == "true":
        video_camera.capture_frame()

    # Always answer: a Flask view that returns None is an error.
    return jsonify(result="done")

def video_frame():
    """Yield an endless multipart stream of JPEG frames from the camera.

    Each yielded chunk is one ``--frame`` part. When the camera returns no
    frame, the last cached frame is sent instead; if there is nothing
    sendable yet, the loop waits briefly and tries again.
    """
    global video_camera
    import time

    if video_camera is None:
        video_camera = VideoCamera()

    while True:
        frame = video_camera.get_video_frame()
        if frame is None:
            frame = video_camera.get_cached_frame()

        # The cached frame may be missing (no successful read yet) or not
        # encoded bytes; concatenating it would raise and kill the stream.
        if not isinstance(frame, (bytes, bytearray)):
            time.sleep(0.01)  # avoid a busy loop while the camera is not ready
            continue

        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')

def image_frame():
    """Yield the last captured document image as a single multipart part.

    Yields nothing when no document has been captured yet.
    """
    global video_camera

    if video_camera is None:
        video_camera = VideoCamera()

    frame = video_camera.get_image_frame()
    if frame is None:
        return

    yield (b'--frame\r\n'
           b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')

@app.route('/video_viewer')
def video_viewer():
    """Respond with the frames produced by video_frame() as a multipart stream."""
    stream = video_frame()
    return Response(stream, mimetype='multipart/x-mixed-replace; boundary=frame')

@app.route('/image_viewer')
def image_viewer():
    """Respond with the output of image_frame() as a multipart stream."""
    captured = image_frame()
    return Response(captured, mimetype='multipart/x-mixed-replace; boundary=frame')

if __name__ == '__main__':
    # /video_viewer is served by a generator that never finishes, so handle
    # each request in its own thread to keep the other routes responsive.
    app.run(threaded=True)
```

Run the app:

`python server.py`