Welcome, folks! In this blog post we will extract the text from a Word
docx file and download it as a text file in the browser using Node.js
and Express. The full source code of the application is shown below.
Get Started
To get started, create a new Node.js project and install the
required packages using the commands below:
npm init -y
npm i express
npm i multer
npm i word-extractor
After that, you will see the directory
structure of the Express app shown below.
Now create the index.js
file and copy-paste the following code:
index.js
1 2 3 4 5 6 7 |
// Minimal word-extractor example: read a Word document from disk and print
// its body text to the terminal.
const WordExtractor = require("word-extractor");

const extractor = new WordExtractor();

// extract() returns a promise; the resolved document exposes getBody().
extractor
  .extract("file.doc")
  .then((doc) => {
    console.log(doc.getBody());
  })
  .catch((err) => {
    // Report the failure instead of leaving the rejection unhandled.
    console.error("Could not extract text:", err);
    process.exitCode = 1;
  });
As you can see, we import the word-extractor
package at the top and then call the extract()
method to read the text inside the Word file (file.doc in this example).
extract() returns a promise, and inside its then() callback we print the
extracted text on the terminal.
Making the Express App
Now create the public/uploads
directory, where all the uploaded docx files will be stored.
Then replace the contents of the index.js
file with the code below:
index.js
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
// Express server that accepts an uploaded Word document, extracts its text
// with word-extractor, and sends the text back as a downloadable .txt file.
const express = require("express");
const WordExtractor = require("word-extractor");
const fs = require("fs");
const multer = require("multer");
const path = require("path");

const app = express();

// Serve everything under public/ as static files.
app.use(express.static("public"));

// Store uploads on disk under public/uploads, named by the upload timestamp
// while keeping the original file's extension.
const storage = multer.diskStorage({
  destination: function (req, file, cb) {
    cb(null, "public/uploads");
  },
  filename: function (req, file, cb) {
    cb(null, Date.now() + path.extname(file.originalname)); // Appending extension
  },
});

const upload = multer({ storage: storage });

app.use(express.json());
app.use(express.urlencoded({ extended: false }));

// Serve the upload form.
app.get("/", (req, res) => {
  res.sendFile(__dirname + "/index.html");
});

// Convert the uploaded document (form field "file") to a text file and send
// it back as an attachment.
app.post("/convert", upload.single("file"), (req, res) => {
  // With no file there is nothing to convert; answer explicitly so the
  // request does not hang without a response.
  if (!req.file) {
    res.status(400).send("No file uploaded");
    return;
  }

  const outputfilepath = "public/uploads/" + Date.now() + ".txt";
  const extractor = new WordExtractor();

  extractor
    .extract(req.file.path)
    // Write asynchronously so the event loop is not blocked during the write.
    .then((doc) => fs.promises.writeFile(outputfilepath, doc.getBody(), "utf-8"))
    .then(() => {
      res.download(outputfilepath);
    })
    .catch((err) => {
      // Extraction or writing failed: log it and tell the client, rather than
      // leaving the request open with an unhandled rejection.
      console.error(err);
      res.status(500).send("Could not extract text from the uploaded file");
    });
});

app.listen(5000, () => {
  console.log("App is listening on port 5000");
});
As you can see in the above code, we import all the required modules and start the Express
app on port 5000. Whenever the user visits the /
route we serve the index.html
file, which contains a simple HTML5
form with a file input
field where the user can select the docx
file to upload. We use the multer
library to save the uploaded file inside the public/uploads
directory. Then we extract the text from the docx file, write it to a text file, and send that file to the browser as a download attachment.
index.html
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Extract Text From DOCX File</title>
  </head>
  <body>
    <!-- Posts the selected document to /convert as multipart/form-data.
         The field name "file" must match upload.single("file") in index.js. -->
    <form action="/convert" method="post" enctype="multipart/form-data">
      <label for="file">Word document (.docx)</label>
      <input type="file" name="file" accept=".docx" id="file" required>
      <button type="submit">Extract Text</button>
    </form>
  </body>
</html>