By M Khalid Junaid


2013-10-21 20:03:55 8 Comments

There may be a scenario we need to get the text from word documents for the future use to search the string in the document uploaded by user like for searching in cv's/resumes and occurs a common problem that how to get the text , Open and read a user uploaded Word document,there are some helpful links but don't cure the whole problem.We need to get the text at the time of uploading and save text in database and we can easily search within the database.

4 comments

@FRanklinDavid 2017-09-24 11:57:02

//For DOCX.If you want to preserve white spaces, also take care of tables tr and tc, use the codes below: Modify it to your taste. Cos it downloads the file from a remote or local

//=========DOCX===========
function extractDocxText($url,$file_name){
        $docx = get_url($url);
        file_put_contents("tempf.docx",$docx);
        $xml_filename = "word/document.xml"; //content file name
        $zip_handle = new ZipArchive;
        $output_text = "";
        if(true === $zip_handle->open("tempf.docx")){
            if(($xml_index = $zip_handle->locateName($xml_filename)) !== false){
                $xml_datas = $zip_handle->getFromIndex($xml_index);
                //file_put_contents($input_file.".xml",$xml_datas);
                $replace_newlines = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-z"0-9 :="]+">/',"\n\r",$xml_datas);
                $replace_tableRows = preg_replace('/<w:tr>/',"\n\r",$replace_newlines);
                $replace_tab = preg_replace('/<w:tab\/>/',"\t",$replace_tableRows);
                $replace_paragraphs = preg_replace('/<\/w:p>/',"\n\r",$replace_tab);
                $replace_other_Tags = strip_tags($replace_paragraphs);          
                $output_text = $replace_other_Tags;
            }else{
                $output_text .="";
            }
            $zip_handle->close();
        }else{
        $output_text .=" ";
        }
        chmod("tempf.docx", 0777);  unlink(realpath("tempf.docx"));
        //save to file or echo content
        file_put_contents($file_name,$output_text);
        echo $output_text;
    }

//========PDF===========
//Requires installation in your Linux server
//sudo su
//apt-get install xpdf
function extractPdfText($url,$PDF_fullpath_or_Filename){
    $pdf = get_url($url);
    file_put_contents ("temppdf.txt", $pdf);
    $content = pdf2text("temppdf.txt");
    chmod("temppdf.txt", 0777); unlink(realpath("temppdf.txt"));
    echo $content;
    file_put_contents($PDF_fullpath_or_Filename,$content);
    }



//========DOC==========
function extractDocText($url,$file_name){
    $doc = get_url($url);
    file_put_contents ("tempf.txt", $doc);

    $fileHandle = fopen("tempf.txt", "r");
    $line = @fread($fileHandle, filesize("tempf.txt"));
    $lines = explode(chr(0x0D),$line);
    $outtext = "";
    foreach($lines as $thisline){
        $pos = strpos($thisline, chr(0x00));
        if (($pos !== FALSE)||(strlen($thisline)==0))
        {} else {$outtext .= $thisline."\n\r";}
        }
    $content = preg_replace('/[a-zA-Z0-9\s\,\.\-\n\r\[email protected]\/\_\(\)]/','  ',$outtext);

    //chmod("tempf.txt", 0777); unlink(realpath("tempf.txt"));
    echo $content;
    file_put_contents($file_name,$content);
    }


//========XLSX==========
function extractXlsxText($url,$file_name){
    $xlsx = get_url($url);
    file_put_contents ("tempf.txt", $xlsx);
    $content = "";
    $dir = 'tempforxlsx';
    // Unzip
    $zip = new ZipArchive();
    $zip->open("tempf.txt");
    $zip->extractTo($dir);
    // Open up shared strings & the first worksheet
    $strings = simplexml_load_file($dir . '/xl/sharedStrings.xml');
    $sheet   = simplexml_load_file($dir . '/xl/worksheets/sheet1.xml');
    // Parse the rows
    $xlrows = $sheet->sheetData->row;
    foreach ($xlrows as $xlrow) {
        $arr = array();

        // In each row, grab it's value
        foreach ($xlrow->c as $cell) {
            $v = (string) $cell->v;

            // If it has a "t" (type?) of "s" (string?), use the value to look up string value
            if (isset($cell['t']) && $cell['t'] == 's') {
                $s  = array();
                $si = $strings->si[(int) $v];

                // Register & alias the default namespace or you'll get empty results in the xpath query
                $si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
                // Cat together all of the 't' (text?) node values
                foreach($si->xpath('.//n:t') as $t) {
                    $content .= $t."  ";}   }
            }
        }
    echo $content;
    file_put_contents($file_name,$content);
    }


//========PPT========== 
function extractPptText($url,$file_name){
    $ppt = file_get_contents($url);
    file_put_contents ("tempf.ppt", $ppt);
    $fileHandle = fopen("tempf.ppt", "r");
    $line = @fread($fileHandle, filesize("tempf.ppt"));
    $lines = explode(chr(0x0f),$line);
    $outtext = '';

    foreach($lines as $thisline) {
        if (strpos($thisline, chr(0x00).chr(0x00).chr(0x00)) == 1) {
            $text_line = substr($thisline, 4);
            $end_pos   = strpos($text_line, chr(0x00));
            $text_line = substr($text_line, 0, $end_pos);
            $text_line = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\[email protected]\/\_\(\)]/',"  ",$text_line);
            $outtext = substr($text_line, 0, $end_pos)."\n".$outtext;
        }
    }
    //echo $outtext;
    file_put_contents($file_name,$outtext);
    }

//========PPTX==========
function extractPptxText($url,$file_name){
    $xls = get_url($url);
    file_put_contents ("tempf.txt", $xls);
    $zip_handle = new ZipArchive;
    $output_text = ' ';
    if(true === $zip_handle->open("tempf.txt")){
        $slide_number = 1; //loop through slide files
        while(($xml_index = $zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
            $xml_datas = $zip_handle->getFromIndex($xml_index); // these four lines of codes
                                                                // below were
            $xml_handle = new DOMDocument ();                   // added by me in order
            $xml_handle->preserveWhiteSpace = true;             // to preserve space between
            $xml_handle->formatOutput = true;                   // each read data
            $xml_handle->loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text .= $xml_handle->saveXML();
            $slide_number++;
            }
        if($slide_number == 1){
            $output_text .= "";
        }
        $zip_handle->close();
    }else{
    $output_text .= "";
    }
    echo $output_text;
    file_put_contents($file_name,$output_text);
    }

    /*

==========================================================================
=========================================================================
And below is get_url() function: Better than fie_get_contents();
*/

function get_url( $url,$timeout = 5 )
    {
        $url = str_replace( "&amp;", "&", urldecode(trim($url)) );
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_ENCODING, "" );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );    # required for https urls
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
        curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        $content = curl_exec( $ch );
        //$response = curl_getinfo( $ch ); 
        curl_close ( $ch );
        return $content;
    }

@M Khalid Junaid 2013-10-21 20:03:55

Here is a simple class which does the right job for .doc/.docx , PHP docx reader: Convert MS Word Docx files to text.

    class DocxConversion{
    private $filename;

    public function __construct($filePath) {
        $this->filename = $filePath;
    }

    private function read_doc() {
        $fileHandle = fopen($this->filename, "r");
        $line = @fread($fileHandle, filesize($this->filename));   
        $lines = explode(chr(0x0D),$line);
        $outtext = "";
        foreach($lines as $thisline)
          {
            $pos = strpos($thisline, chr(0x00));
            if (($pos !== FALSE)||(strlen($thisline)==0))
              {
              } else {
                $outtext .= $thisline." ";
              }
          }
         $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\[email protected]\/\_\(\)]/","",$outtext);
        return $outtext;
    }

    private function read_docx(){

        $striped_content = '';
        $content = '';

        $zip = zip_open($this->filename);

        if (!$zip || is_numeric($zip)) return false;

        while ($zip_entry = zip_read($zip)) {

            if (zip_entry_open($zip, $zip_entry) == FALSE) continue;

            if (zip_entry_name($zip_entry) != "word/document.xml") continue;

            $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));

            zip_entry_close($zip_entry);
        }// end while

        zip_close($zip);

        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content);
        $striped_content = strip_tags($content);

        return $striped_content;
    }

 /************************excel sheet************************************/

function xlsx_to_text($input_file){
    $xml_filename = "xl/sharedStrings.xml"; //content file name
    $zip_handle = new ZipArchive;
    $output_text = "";
    if(true === $zip_handle->open($input_file)){
        if(($xml_index = $zip_handle->locateName($xml_filename)) !== false){
            $xml_datas = $zip_handle->getFromIndex($xml_index);
            $xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text = strip_tags($xml_handle->saveXML());
        }else{
            $output_text .="";
        }
        $zip_handle->close();
    }else{
    $output_text .="";
    }
    return $output_text;
}

/*************************power point files*****************************/
function pptx_to_text($input_file){
    $zip_handle = new ZipArchive;
    $output_text = "";
    if(true === $zip_handle->open($input_file)){
        $slide_number = 1; //loop through slide files
        while(($xml_index = $zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
            $xml_datas = $zip_handle->getFromIndex($xml_index);
            $xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text .= strip_tags($xml_handle->saveXML());
            $slide_number++;
        }
        if($slide_number == 1){
            $output_text .="";
        }
        $zip_handle->close();
    }else{
    $output_text .="";
    }
    return $output_text;
}


    public function convertToText() {

        if(isset($this->filename) && !file_exists($this->filename)) {
            return "File Not exists";
        }

        $fileArray = pathinfo($this->filename);
        $file_ext  = $fileArray['extension'];
        if($file_ext == "doc" || $file_ext == "docx" || $file_ext == "xlsx" || $file_ext == "pptx")
        {
            if($file_ext == "doc") {
                return $this->read_doc();
            } elseif($file_ext == "docx") {
                return $this->read_docx();
            } elseif($file_ext == "xlsx") {
                return $this->xlsx_to_text();
            }elseif($file_ext == "pptx") {
                return $this->pptx_to_text();
            }
        } else {
            return "Invalid File Type";
        }
    }

}

Document_file_format Doc files are binary blobs.They can be read by using fopen.While .docx files are just zip files and xml files xml files in a zipfile container (source wikipedia) you can read them by using zip_open.

Usage of above class

$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
echo $docText= $docObj->convertToText();

@Mohammed Sufian 2014-01-26 09:58:36

thank you for this awesome answer.. i am trying to use your code .. but it is not working ..it displays me empty.. and i thing my zip not working.. can u help me out to find the problem..

@Volatil3 2014-05-29 05:17:57

How efficient is this class to read thousands of document? does it read entire content at once or can read first page first?

@Strawberry 2015-04-24 05:30:28

@M Khalid Junaid How to read a docx file with images or math type equation, whether it has some procedures to do that or library?

@M Khalid Junaid 2015-04-26 07:44:00

@Strawberry for image i have added a repository you can see an example there Extract Images from Docx file for math type equation i am not sure regarding your idea or may i didn't get your point

@Strawberry 2015-04-28 05:06:00

stackoverflow.com/questions/29791764/… this is what exactly i'm facing do you have any idea on this [email protected]

@Himanshu Upadhyay 2016-02-27 06:16:29

what is 'xl/sharedStrings.xml' file in xlsx to text function ?

@Rohan Gala 2016-03-19 11:43:52

This is working for conversion for docx files to text but the format of output text is changed. Any way to maintain the formatting also?

@COMisHARD 2016-03-19 14:32:33

This works great, except it does not extract footnotes from a word document. Any tips on how I could modify this so that footnotes are extracted and appended to the end of the string?

@HartleySan 2016-09-10 17:35:22

Great class! Is there any way to change the read_doc function to handle UTF-8 / non-English characters? Thanks.

@Hassan Dad Khan 2016-12-29 07:02:56

Is it possible to read the text with spaces and keep them as well. It converts text into raw form e.g First Name: Abdullah khan Occupation: Developer. Which makes it difficult to extract only name by using regular expression.

@Thomas Valadez 2017-06-22 00:45:51

I have an issue with this class. I am wondering if anyone else experienced the same thing and got it working... stackoverflow.com/questions/44687738/…

@masud_moni 2017-08-01 11:48:55

working like a charm. Just one thing - I cant get the auto numbering (ordered list). Is there any way to complete this?

@Nick 2018-05-09 04:16:55

I've cleaned this up to make everything more cohesive in naming, cleared out some unnecessary code and added a wrapper for also extracting text from PDF files, when using a 3rd party library. The code can be viewed in the gist here: gist.github.com/nickrouty/6f5ed07e79d2223b279fc5e662264b10

@SGh 2019-08-14 19:17:18

It works like a charm!

@Jumper Pot 2014-11-27 06:16:03

From DOC file

$filename = 'ypue file';         
    if ( file_exists($filename) ) {        

    if ( ($fh = fopen($filename, 'r')) !== false ) {

    $headers = fread($fh, 0xA00);

    $n1 = ( ord($headers[0x21C]) - 1 );

    $n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );

    $n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );

    $n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );


    $textLength = ($n1 + $n2 + $n3 + $n4);

    $extracted_plaintext = fread($fh, $textLength);

    echo nl2br($extracted_plaintext);

    print_r(extract_emails_from($extracted_plaintext));    

         }

        }

    function extract_emails_from($string) {
             preg_match_all("/[\._a-zA-Z0-9-][email protected][\._a-zA-Z0-9-]+/i", $string, $matches);
             return $matches[0];
    }

From DOCX :

    /*Name of the document file*/
    $document = 'your file';

    /**Function to extract text*/
    function extracttext($filename) {
        //Check for extension
        $ext = end(explode('.', $filename));

    //if its docx file
    if($ext == 'docx')
    $dataFile = "word/document.xml";
    //else it must be odt file
    else
    $dataFile = "content.xml";     

    //Create a new ZIP archive object
    $zip = new ZipArchive;

    // Open the archive file
    if (true === $zip->open($filename)) {
        // If successful, search for the data file in the archive
        if (($index = $zip->locateName($dataFile)) !== false) {
            // Index found! Now read it to a string
            $text = $zip->getFromIndex($index);
            // Load XML from a string
            // Ignore errors and warnings
            $xml = DOMDocument::loadXML($text, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            // Remove XML formatting tags and return the text
            return strip_tags($xml->saveXML());
        }
        //Close the archive file
        $zip->close();
    }

    // In case of failure return a message
    return "File not found";
}

echo extracttext($document);

@OshoParth 2015-09-01 10:27:25

Hi it would be great if some comments could be added to the code to make it more understandable as am facing some issues in understanding it .

@Jezz 2014-12-22 14:34:12

For docx documents, I suggest use of docx2txt tool (available at least on Debian/Ubuntu):

docx2txt < your_file.docx

README explain how to integrate it with vim. Add to your .vimrc:

" use docx2txt.pl to allow VIm to view the text content of a .docx file directly.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt

(it also explain how to integrate with emacs).

For hackers, this tool is written in perl.

@MKN Web Solutions 2015-05-22 19:12:10

This is only for docx's and not .docs

Related Questions

Sponsored Content

26 Answered Questions

[SOLVED] How do I get (extract) a file extension in PHP?

  • 2008-10-06 11:00:02
  • e-satis
  • 541669 View
  • 683 Score
  • 26 Answer
  • Tags:   php file-extension

1 Answered Questions

[SOLVED] How to search multiple DOCX files for a string within a Word field?

2 Answered Questions

[SOLVED] How to extract text data from MS-Word doc file

  • 2013-02-13 10:15:27
  • Thomas
  • 9388 View
  • 2 Score
  • 2 Answer
  • Tags:   c# ms-word

3 Answered Questions

[SOLVED] How do I extract data from a doc/docx file using Python

2 Answered Questions

[SOLVED] Editing a Word .docx file from C# application

  • 2015-07-28 17:58:30
  • BoltBait
  • 7812 View
  • 2 Score
  • 2 Answer
  • Tags:   c# ms-word

4 Answered Questions

1 Answered Questions

[SOLVED] Unable to extract mailmerge fields from Word DOCX using PHP and DOM

  • 2012-10-22 21:16:11
  • Sean Cunningham
  • 1844 View
  • 1 Score
  • 1 Answer
  • Tags:   php xml ms-word docx

1 Answered Questions

[SOLVED] Compress MS word docx documents using barcode fonts in PHP scipt

  • 2012-03-22 10:00:08
  • mat
  • 905 View
  • 0 Score
  • 1 Answer
  • Tags:   php ms-word docx

2 Answered Questions

[SOLVED] Copy text from word file to a new word

3 Answered Questions

[SOLVED] How to index and search .doc files

Sponsored Content