How to Build Web Document Scanner Using OpenCV-Python
Many excellent mobile document-scanning apps support not only image capture but also edge detection and perspective transformation. If you are interested in these computer vision techniques, you can use OpenCV to build a free document scanner app yourself. In this post, I will show, step by step, how to use OpenCV-Python to create a web document scanner.
Setting Up Environment
Download Python 3.5.
Install Flask:
pip3 install flask
Install OpenCV 3.3.0 for Python:
pip3 install opencv-python
Download NumPy 1.11.2 (the latest version at the time of writing). Unzip the package, then build and install it:
python3 setup.py build install
To compile NumPy source code on Windows 10, install Microsoft Visual C++ Compiler for Python 2.7.
Web Document Scanner
Article and Code References
- https://github.com/vipul-sharma20/document-scanner
- http://www.pyimagesearch.com/2014/08/25/4-point-opencv-getperspective-transform-example/
- http://docs.opencv.org/3.1.0/dd/d49/tutorial_py_contour_features.html
- https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm
- http://www.pyimagesearch.com/2015/04/06/zero-parameter-automatic-canny-edge-detection-with-python-and-opencv/
Steps of Building the App
Create document.py to do edge detection and perspective transformation:
import cv2
import rect
import numpy as np
class Scanner(object):
    """Find a four-sided outline in an image and optionally flatten it.

    ``detect_edge`` looks for the largest contour that simplifies to four
    points, draws it on the input image and, on request, hands the four
    corners to ``four_point_transform`` to produce a top-down view.
    """

    # http://www.pyimagesearch.com/2014/08/25/4-point-opencv-getperspective-transform-example/
    def four_point_transform(self, image, rect):
        """Warp the quadrilateral ``rect`` of ``image`` onto an upright rectangle.

        Args:
            image: Source image handed to ``cv2.warpPerspective``.
            rect: Four (x, y) corner points ordered top-left, top-right,
                bottom-right, bottom-left. Anything convertible to a
                4x2 float32 array is accepted.

        Returns:
            The warped image, ``maxWidth`` x ``maxHeight`` pixels.
        """
        # cv2.getPerspectiveTransform needs a float32 array of four points;
        # this is a no-op for input that already has that form.
        rect = np.asarray(rect, dtype="float32").reshape(4, 2)
        (tl, tr, br, bl) = rect

        # width of the new image: the longer of the bottom edge
        # (bottom-right to bottom-left) and the top edge
        # (top-right to top-left), measured as Euclidean distance
        widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
        widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
        maxWidth = max(int(widthA), int(widthB))

        # height of the new image: the longer of the right edge
        # (top-right to bottom-right) and the left edge
        # (top-left to bottom-left)
        heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
        heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
        maxHeight = max(int(heightA), int(heightB))

        # destination corners of the "birds eye view" (top-down view),
        # in the same top-left, top-right, bottom-right, bottom-left order
        dst = np.array([
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1]], dtype="float32")

        # compute the perspective transform matrix and then apply it
        M = cv2.getPerspectiveTransform(rect, dst)
        warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
        return warped

    # https://github.com/vipul-sharma20/document-scanner
    def detect_edge(self, image, enabled_transform=False):
        """Outline the largest four-cornered contour found in ``image``.

        Args:
            image: BGR image. The detected outline is drawn onto this
                array in place.
            enabled_transform: When true, also compute the perspective-
                corrected view of the outlined region from an untouched
                copy of the input.

        Returns:
            A tuple ``(image, dst)``. ``image`` is the input with the
            outline drawn (unchanged if nothing was found). ``dst`` is the
            warped view, or ``None`` when ``enabled_transform`` is false
            or no four-point contour was found.
        """
        dst = None
        # keep a clean copy so the warped result does not contain the
        # green outline drawn below
        orig = image.copy()

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(blurred, 0, 20)

        # OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x
        # return (contours, hierarchy); the contour list is always the
        # second-to-last element.
        contours = cv2.findContours(
            edged, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]
        # largest area first, so the first quadrilateral hit is the biggest
        contours = sorted(contours, key=cv2.contourArea, reverse=True)

        for cnt in contours:
            # simplify the contour; the tolerance scales with its perimeter
            epsilon = 0.051 * cv2.arcLength(cnt, True)
            approx = cv2.approxPolyDP(cnt, epsilon, True)

            if len(approx) == 4:
                target = approx
                cv2.drawContours(image, [target], -1, (0, 255, 0), 2)
                if enabled_transform:
                    # rect.rectify is expected to return the corners in
                    # the order four_point_transform unpacks them
                    approx = rect.rectify(target)
                    dst = self.four_point_transform(orig, approx)
                break

        return image, dst
Create camera.py to capture frames from a camera:
import cv2
from document import Scanner
class VideoCamera(object):
    """Read frames from a camera and run them through a ``Scanner``.

    Frames handed out by this class are JPEG-encoded ``bytes`` so they can
    be written straight into an HTTP response.
    """

    def __init__(self, device_index=2):
        """Open the capture device and reset the cached results.

        Args:
            device_index: Index passed to ``cv2.VideoCapture``. Defaults to
                2; use 0 when only one camera is connected.
        """
        # Open a camera
        self.cap = cv2.VideoCapture(device_index)

        # Initialize video recording environment
        self.is_record = False
        self.out = None
        # JPEG bytes of the last perspective-corrected capture, or None
        self.transformed_frame = None

        self.scanner = Scanner()
        # JPEG bytes of the last successfully encoded preview frame, or None
        self.cached_frame = None

    def __del__(self):
        # __init__ may have failed before self.cap was assigned
        cap = getattr(self, 'cap', None)
        if cap is not None:
            cap.release()

    def get_video_frame(self):
        """Grab a frame, draw the detected outline and return it as JPEG.

        Returns:
            JPEG ``bytes`` of the annotated frame, or ``None`` when the
            camera read or the JPEG encoding failed.
        """
        ret, frame = self.cap.read()
        if not ret:
            return None

        frame, _ = self.scanner.detect_edge(frame)
        ret, jpeg = cv2.imencode('.jpg', frame)
        if not ret:
            return None

        data = jpeg.tobytes()
        # Cache the encoded bytes (not the raw array) so the cached frame
        # can be sent to a client exactly like a fresh one.
        self.cached_frame = data
        return data

    def capture_frame(self):
        """Grab a frame and store its perspective-corrected view as JPEG.

        The result is kept in ``transformed_frame``. When the camera read
        fails, no four-cornered outline is found or encoding fails, the
        previously stored capture is left untouched.

        Returns:
            Always ``None``.
        """
        ret, frame = self.cap.read()
        if not ret:
            return None

        _, warped = self.scanner.detect_edge(frame, True)
        if warped is None:
            # nothing to encode: no document outline in this frame
            return None

        ret, jpeg = cv2.imencode('.jpg', warped)
        if ret:
            self.transformed_frame = jpeg.tobytes()
        return None

    def get_cached_frame(self):
        """Return the last preview frame as JPEG bytes, or ``None``."""
        return self.cached_frame

    def get_image_frame(self):
        """Return the last captured document as JPEG bytes, or ``None``."""
        return self.transformed_frame
Note: if you have only one device connected, the parameter in cv2.VideoCapture() should be 0.
Create server.py to stream camera frames to your web client:
from flask import Flask, render_template, Response, jsonify, request
from camera import VideoCamera
# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Shared VideoCamera instance; stays None until a handler first needs it.
video_camera = None
@app.route('/')
def index():
    """Serve the page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/capture_status', methods=['POST'])
def capture_status():
    """Trigger a document capture when the client posts ``{"status": "true"}``.

    Returns:
        ``{"result": "done"}`` for a well-formed request (whether or not a
        capture was triggered), or a 400 response with
        ``{"result": "error", ...}`` when the body is not a JSON object
        containing a ``status`` key.
    """
    global video_camera

    if video_camera is None:
        video_camera = VideoCamera()

    # silent=True yields None instead of raising on a missing or malformed
    # body, so the problem can be reported as a client error below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'status' not in payload:
        return jsonify(result="error",
                       message="expected a JSON object with a 'status' key"), 400

    if payload['status'] == "true":
        video_camera.capture_frame()

    return jsonify(result="done")
def video_frame():
    """Yield an endless multipart stream of JPEG preview frames.

    Each yielded chunk is one ``--frame`` part with an ``image/jpeg``
    content type. When the camera cannot deliver a new frame, the last
    cached frame is sent again; if there is no usable cached frame either,
    nothing is sent until one becomes available.
    """
    import time

    global video_camera

    if video_camera is None:
        video_camera = VideoCamera()

    while True:
        frame = video_camera.get_video_frame()
        if frame is None:
            frame = video_camera.get_cached_frame()

        if not isinstance(frame, bytes):
            # No streamable frame yet (cache empty or not JPEG bytes).
            # Concatenating it with the part header would raise TypeError,
            # so wait briefly and retry instead of spinning.
            time.sleep(0.01)
            continue

        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')
def image_frame():
    """Yield the last captured document as a single multipart JPEG part.

    Yields nothing when no capture has been stored yet.
    """
    global video_camera

    if video_camera is None:
        video_camera = VideoCamera()

    frame = video_camera.get_image_frame()
    if frame is not None:
        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')
@app.route('/video_viewer')
def video_viewer():
    """Stream the live preview produced by ``video_frame``."""
    stream = video_frame()
    return Response(stream,
                    mimetype='multipart/x-mixed-replace; boundary=frame')
@app.route('/image_viewer')
def image_viewer():
    """Send the captured document produced by ``image_frame``."""
    parts = image_frame()
    return Response(parts,
                    mimetype='multipart/x-mixed-replace; boundary=frame')
if __name__ == '__main__':
    # Start the server only when this file is run as a script.
    # host='0.0.0.0' listens on all network interfaces, so the page is
    # reachable from other machines, not just localhost.
    app.run(threaded=True, host='0.0.0.0')
Run the app:
python server.py