Public Records Request Tools

These are various scripts written by Nicole to help the help desk fulfill public records requests.

  • email.php
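    This script walks a directory tree of .eml files and searches each message's subject and text body for any word or phrase in a given list. Matching messages are moved to a SAVED-EMAILS directory and every other file is deleted, so run it only on a copy of the data. It requires the MimeMailParser class.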
    <?php
     
    // Import our mail parser
    require_once('MimeMailParser.class.php');
     
    // Search terms: a message is kept when its subject or text body contains
    //  any of these (case-insensitive substring match, see contains()).
    $words = array(
        'bail',
        'pretrial release',
        'pretrial justice',
        'bond',
        'bondsman',
        'custody release',
        'house arrest with electronic monitoring',
        'written promise',
        'surety',
        'jail',
        'jailer',
        'juvenile detention center',
    );

    // Source tree to scan and the sibling tree that matching messages are moved
    //  to. Both are derived from the working directory so the script does not
    //  depend on being installed at one particular absolute path.
    $sourceRoot = realpath( getcwd() . '/emails' );
    if( $sourceRoot === false ) {
        exit( 'Source directory not found: ' . getcwd() . '/emails<br>' );
    }
    $savedRoot = dirname( $sourceRoot ) . DIRECTORY_SEPARATOR . 'SAVED-EMAILS';

    // Get all of the files (the list also contains the directories themselves)
    $files = getDirContents( $sourceRoot );

    foreach( $files as $filename ) {

        // Only regular files can be parsed, copied and unlink()ed; this skips
        //  the directory entries as well as anything that could not be resolved
        if( !is_file( $filename ) ) {
            continue;
        }

        echo 'Working on: ' . $filename . '<br>';

        $parser = new MimeMailParser();
        $parser->setPath( $filename );

        $subject = $parser->getHeader( 'subject' );
        $text = $parser->getMessageBody( 'text' );

        // Drop the parser before the file is copied or deleted
        //  (presumably so it lets go of the file first - matters on Windows)
        unset( $parser );

        // None of the words found: delete the source file and move on
        if( !contains( $subject, $words ) && !contains( $text, $words ) ) {
            unlink( $filename );
            continue;
        }

        // Mirror the file's position below the source root into the saved tree
        $newname = $savedRoot . substr( $filename, strlen( $sourceRoot ) );
        $newdir = dirname( $newname );

        // copy() does not create missing directories, so make sure it exists
        if( !is_dir( $newdir ) && !mkdir( $newdir, 0777, true ) ) {
            echo 'Error creating directory: ' . $newdir . '<br>';
            continue;
        }

        // Try to copy the file
        if( !copy( $filename, $newname ) ) {

            // copy returned false, which means we hit an error!
            //  The source file is left in place so nothing is lost.
            echo 'Error on copy: ' . $filename . ' to ' . $newname . '<br>';

        // Copy was fine
        } else {

            // Delete the source file
            unlink( $filename );
        }
    }
     
     
    /*
    $path = 'test.eml'; 
    $parser = new MimeMailParser(); 
    $parser->setPath( $path );
     
    $subject = $parser->getHeader( 'subject' );
     
    $text = $parser->getMessageBody( 'text' );
     
    echo $subject;
     
    echo '<br>';
     
    echo $text;
    */
     
     
    /**
     * Recursively lists everything below a directory.
     *
     * Files and sub-directories are returned as resolved (realpath) paths. The
     * contents of a sub-directory are listed before the sub-directory itself.
     *
     * Adapted from
     *  https://stackoverflow.com/questions/24783862/list-all-the-files-and-folders-in-a-directory-with-php-recursive-function
     *
     * @param string $dir      Directory to walk.
     * @param array  $results  Accumulator used by the recursion; every path found is appended to it.
     * @return array The accumulated list of paths (unchanged when $dir cannot be read).
     */
    function getDirContents($dir, &$results = array()){
        // scandir() returns false for anything that is not a readable directory;
        // iterating over that would only produce warnings
        $entries = is_dir($dir) ? scandir($dir) : false;
        if ($entries === false) {
            return $results;
        }

        foreach ($entries as $entry) {
            // Never follow the self/parent entries
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $path = realpath($dir . DIRECTORY_SEPARATOR . $entry);

            // realpath() fails for entries that cannot be resolved (e.g. a
            // dangling link); do not record false as if it were a path
            if ($path === false) {
                continue;
            }

            // Descend first so a directory appears after everything inside it
            if (is_dir($path)) {
                getDirContents($path, $results);
            }
            $results[] = $path;
        }

        return $results;
    }
     
     
    /**
     * Reports whether a string contains at least one entry of a word list.
     * Each entry is looked up with a case-insensitive substring search.
     *
     * Based on https://stackoverflow.com/questions/13795789/check-if-string-contains-word-in-array
     *
     * @param string $str  Text to search in.
     * @param array  $arr  Words or phrases to look for.
     * @return bool true as soon as one entry is found, false when none is.
     */
    function contains($str, array $arr)
    {
        $found = false;

        foreach ($arr as $needle) {
            // Not in the text: try the next entry
            if (false === stripos($str, $needle)) {
                continue;
            }

            // First hit is enough
            $found = true;
            break;
        }

        return $found;
    }
  • files.php
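    This script reads every file in a directory tree and checks whether the file name or the file contents contain any of the words in a list. Files last modified before the configured start date are deleted, matching files are moved to a "saved" directory, and everything else is deleted, so run it only on a copy of the data. PDF parsing requires the smalot/pdfparser Composer package.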
    <?php
     
    /*
     * This file is best run from the command line
     *  in the root directory of the files that need
     *  to be scanned.
     */
     
    /*
        composer.json contents:
     
        {
            "require": {
                "smalot/pdfparser": "*"
            }
        }
     
    */
     
    // Set the memory limit to something stupidly high so we can load large files
    ini_set('memory_limit','128G');

    // Include the composer autoloader.
    include 'vendor/autoload.php';

    // The list of words we want to keep
    $words = array(
        'judicial standards commission',
        'hunt gwynn',
        'gwynn',
    );

    // The fallback word which will be used to "force" a document to be saved when
    //  it cannot be read. This fixes problems with "secure" PDFs.
    $catchWord = 'gwynn';

    // Only keep files on or after this date
    $start = '2018-06-01';

    // Convert the lower bound to a unix time once, not on every iteration
    $startTime = strtotime( $start );

    // Root of the tree being scanned and the sibling tree that kept files are
    //  moved to. Kept files keep their relative position below the root.
    $sourceRoot = realpath( getcwd() . '/files' );
    if( $sourceRoot === false ) {
        exit( "Source directory not found: " . getcwd() . "/files\n" );
    }
    $savedRoot = dirname( $sourceRoot ) . DIRECTORY_SEPARATOR . 'saved';

    // Get all of the files. getDirContents() lists a directory AFTER everything
    //  inside it, so by the time a directory comes up its contents have already
    //  been handled and a single pass is enough. (Re-queueing non-empty
    //  directories, as was done before, could never succeed later and made the
    //  script spin forever whenever a single file could not be removed.)
    $files = getDirContents( $sourceRoot );

    foreach( $files as $filename ) {

        // Skip anything that could not be resolved or has vanished meanwhile
        if( !is_string( $filename ) || !file_exists( $filename ) ) {
            continue;
        }

        // Windows loves to make stuff read only, fix that
        chmod( $filename, 0777 );

        // Output what file we're working on
        echo "Working on: " . $filename . "\n";

        // Is this a directory?
        if( is_dir( $filename ) ) {

            // Output that it is
            echo "Investigating directory: " . $filename . "\n";

            // Empty directories are removed, anything else stays where it is
            if( dir_is_empty( $filename ) ) {
                echo "     Removing directory: " . $filename . "\n";
                rmdir( $filename );
            } else {
                echo "     Directory not empty, leaving in place: " . $filename . "\n";
            }

            // Go to the next loop iteration
            continue;
        }

        // Is the file older than the start time?
        $modified = filemtime( $filename );
        if( $modified !== false && $modified < $startTime ) {

            // Indicate we're deleting it for being old
            //  (double quotes so that \n really is a line break)
            echo '     Deleted due to age (' . date( "F d Y H:i:s.", $modified ) . ")\n";

            // Delete the file
            unlink( $filename );

            // Go to the next iteration of the loop
            continue;
        }

        // Get some information about the file; files without an extension
        //  have no 'extension' key at all
        $pathInfo = pathinfo( $filename );
        $extension = isset( $pathInfo[ 'extension' ] ) ? $pathInfo[ 'extension' ] : '';

        // Set up a variable to hold the text of the document
        $text = '';

        // Do different things based on the file extension
        switch( $extension ) {

            // Is this a pdf?
            case 'pdf':

                // Attempt to parse the file using PdfParser
                try {
                    $parser = new \Smalot\PdfParser\Parser();
                    $pdf = $parser->parseFile( $filename );
                    $text = $pdf->getText();

                // This is a catch to ensure any secure pdfs (which can't be scanned)
                // are just assumed to be relevant
                } catch( Exception $e ) {
                    $text = $catchWord;
                }

                break;

            // Is it an office document?
            case 'docx':
            case 'xlsx':
            case 'pptx':
            case 'doc':
                $docObj = new DocxConversion( $filename );
                $text = $docObj->convertToText();
                break;

            // Everything else just dump the raw file into our text variable
            default:
                $text = file_get_contents( $filename );
                break;

        }

        // A non-string result (false) means the content could not be read.
        //  Treat it like a secure PDF and keep the file rather than deleting
        //  something nobody was able to look at.
        if( !is_string( $text ) ) {
            $text = $catchWord;
        }

        // Check if the filename or file contents contain any of our words
        if( contains( $filename, $words ) || contains( $text, $words ) ) {

            // Get our destination directory: same relative path, below the saved root
            $newdir = $savedRoot . substr( $pathInfo[ 'dirname' ], strlen( $sourceRoot ) );

            // Create our new filename using the new path
            $newname = $newdir . DIRECTORY_SEPARATOR . $pathInfo[ 'basename' ];

            // Check if the destination directory exists
            if( !is_dir( $newdir ) ) {

                // Create it
                mkdir( $newdir, 0777, true );
            }

            // Try to copy the file
            if( !copy( $filename, $newname ) ) {

                // copy returned false, which means we hit an error!
                //  The source file is left in place so nothing is lost.
                echo "     Error on copy: " . $filename . " to " . $newname . "\n";

            // Copy was fine
            } else {

                // Indicate we're keeping the file
                echo "     KEEP: " . $filename . "\n";

                // Delete the source file
                unlink( $filename );
            }

        // Words not found
        } else {

            // Indicate we're deleting it
            echo "     DELETE: " . $filename . "\n";

            // Delete the source file
            unlink( $filename );
        }
    }
     
     
    /**
     * Recursively collects the resolved (realpath) paths of all files and
     * directories found below $dir.
     *
     * The order is depth-first with a directory listed after its own contents.
     *
     * Adapted from
     *  https://stackoverflow.com/questions/24783862/list-all-the-files-and-folders-in-a-directory-with-php-recursive-function
     *
     * @param string $dir      Directory to walk.
     * @param array  $results  Accumulator shared across the recursive calls; paths are appended to it.
     * @return array The accumulated list of paths (unchanged when $dir cannot be read).
     */
    function getDirContents($dir, &$results = array()){
        // Guard against a missing/unreadable directory: scandir() would return
        // false and the loop below would choke on it
        $entries = is_dir($dir) ? scandir($dir) : false;
        if ($entries === false) {
            return $results;
        }

        foreach ($entries as $entry) {
            // Skip the self and parent pseudo-entries
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $path = realpath($dir . DIRECTORY_SEPARATOR . $entry);

            // An entry that cannot be resolved yields false, which must not
            // end up in the list as a "path"
            if ($path === false) {
                continue;
            }

            // Children first, then the directory itself
            if (is_dir($path)) {
                getDirContents($path, $results);
            }
            $results[] = $path;
        }

        return $results;
    }
     
     
    /**
     * Tells whether a directory has no entries other than "." and "..".
     *
     * Based on https://stackoverflow.com/questions/7497733/how-can-i-use-php-to-check-if-a-directory-is-empty
     *
     * @param string $dir Path of the directory to inspect.
     * @return bool TRUE when the directory could be opened and holds nothing;
     *              FALSE when it holds at least one entry or cannot be opened.
     */
    function dir_is_empty($dir) {
      // opendir() gives false instead of a handle on failure. Handing that to
      // readdir() is an error, and a directory we cannot look into must not be
      // reported as empty (the caller removes empty directories).
      $handle = is_dir($dir) ? opendir($dir) : false;
      if ($handle === false) {
        return FALSE;
      }

      $empty = TRUE;
      while (false !== ($entry = readdir($handle))) {
        if ($entry !== "." && $entry !== "..") {
          // One real entry is enough to know the answer
          $empty = FALSE;
          break;
        }
      }

      // Single exit point so the handle is always released
      closedir($handle);
      return $empty;
    }
     
     
    /**
     * Checks a text against a list of words/phrases.
     * The search is a case-insensitive substring match and stops at the first hit.
     *
     * Based on https://stackoverflow.com/questions/13795789/check-if-string-contains-word-in-array
     *
     * @param string $str  Text to search in.
     * @param array  $arr  Words or phrases to look for.
     * @return bool true if any entry occurs in the text, false otherwise.
     */
    function contains($str, array $arr)
    {
        $matched = false;

        foreach ($arr as $word) {
            $position = stripos($str, $word);

            if ($position !== false) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
     
     
        /**
         * Extracts plain text from Word (.doc, .docx), Excel (.xlsx) and
         * PowerPoint (.pptx) files so the text can be searched.
         *
         * Usage:
         *   $converter = new DocxConversion( $path );
         *   $text = $converter->convertToText();
         */
        class DocxConversion{
            /** @var string Path of the document to convert. */
            private $filename;

            /**
             * @param string $filePath Path of the document to convert.
             */
            public function __construct($filePath) {
                $this->filename = $filePath;
            }

            /**
             * Pulls readable text out of a binary .doc file.
             *
             * The file is split on carriage returns; chunks that are empty or
             * contain a NUL byte are dropped, the rest are joined with spaces and
             * finally stripped of every character that is not a letter, digit,
             * whitespace or one of , . - @ / _ ( ).
             *
             * @return string|false The extracted text, or false when the file cannot be read.
             */
            private function read_doc() {
                // file_get_contents() opens, reads and closes in one go, so no
                // handle is left open (the old fopen() was never closed)
                $raw = file_get_contents($this->filename);
                if ($raw === false) {
                    return false;
                }

                $outtext = "";
                foreach (explode(chr(0x0D), $raw) as $thisline) {
                    // Skip empty chunks and chunks holding a NUL byte
                    if (strlen($thisline) == 0 || strpos($thisline, chr(0x00)) !== FALSE) {
                        continue;
                    }
                    $outtext .= $thisline." ";
                }

                return preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r     @\/\_\(\)]/","",$outtext);
            }

            /**
             * Extracts the text of a .docx file from its word/document.xml part.
             *
             * @return string|false The text with markup removed ('' when the part is
             *                      missing), or false when the file cannot be opened as a zip.
             */
            private function read_docx(){
                // ZipArchive replaces the deprecated zip_open()/zip_read() functions
                // and matches what the xlsx/pptx readers already use
                $zip = new ZipArchive;
                if (true !== $zip->open($this->filename)) {
                    return false;
                }

                $content = $zip->getFromName("word/document.xml");
                $zip->close();

                if ($content === false) {
                    $content = '';
                }

                // Keep table cells apart and turn paragraph ends into line breaks
                // before the tags are thrown away
                $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
                $content = str_replace('</w:r></w:p>', "\r\n", $content);

                return strip_tags($content);
            }

            /**
             * Parses an XML part and returns it with all tags removed.
             *
             * @param string|false $xml_datas Raw XML as read from the archive.
             * @return string The tag-free text, or "" when the XML is empty or cannot be parsed.
             */
            private function xml_part_to_text($xml_datas) {
                if ($xml_datas === false || $xml_datas === '') {
                    return "";
                }

                // loadXML() is an instance method; the static call is deprecated in
                // PHP 7 and a fatal error in PHP 8. LIBXML_NOENT is deliberately not
                // passed: it switches on entity substitution, which lets a crafted
                // document pull in external entities (XXE).
                $xml_handle = new DOMDocument();
                if (!$xml_handle->loadXML($xml_datas, LIBXML_NOERROR | LIBXML_NOWARNING)) {
                    return "";
                }

                return strip_tags($xml_handle->saveXML());
            }

            /**
             * Extracts the text of an .xlsx file from its xl/sharedStrings.xml part.
             *
             * @param string $input_file Path of the workbook.
             * @return string The extracted text, "" when the file or the part cannot be read.
             */
            public function xlsx_to_text($input_file){
                $zip_handle = new ZipArchive;
                if (true !== $zip_handle->open($input_file)) {
                    return "";
                }

                $output_text = "";
                $xml_index = $zip_handle->locateName("xl/sharedStrings.xml");
                if ($xml_index !== false) {
                    $output_text = $this->xml_part_to_text($zip_handle->getFromIndex($xml_index));
                }

                $zip_handle->close();
                return $output_text;
            }

            /**
             * Extracts the text of a .pptx file from its slide parts.
             *
             * Slides are read as ppt/slides/slide1.xml, slide2.xml, ... and reading
             * stops at the first number that is not present in the archive.
             *
             * @param string $input_file Path of the presentation.
             * @return string The text of all slides read, "" when the file cannot be opened.
             */
            public function pptx_to_text($input_file){
                $zip_handle = new ZipArchive;
                if (true !== $zip_handle->open($input_file)) {
                    return "";
                }

                $output_text = "";
                $slide_number = 1;
                while (($xml_index = $zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false) {
                    $output_text .= $this->xml_part_to_text($zip_handle->getFromIndex($xml_index));
                    $slide_number++;
                }

                $zip_handle->close();
                return $output_text;
            }

            /**
             * Converts the document given to the constructor to plain text.
             *
             * The reader is chosen by file extension, compared case-insensitively.
             *
             * @return string|false The extracted text; "File Not exists" when the file
             *                      is missing; "Invalid File Type" for any other
             *                      extension; false when a .doc/.docx cannot be read.
             */
            public function convertToText() {
                if(isset($this->filename) && !file_exists($this->filename)) {
                    return "File Not exists";
                }

                $fileArray = pathinfo($this->filename);

                // Files without an extension have no 'extension' key; lower-casing
                // lets "REPORT.DOCX" be handled like "report.docx"
                $file_ext = isset($fileArray['extension']) ? strtolower($fileArray['extension']) : '';

                switch ($file_ext) {
                    case "doc":
                        return $this->read_doc();
                    case "docx":
                        return $this->read_docx();
                    case "xlsx":
                        return $this->xlsx_to_text($this->filename);
                    case "pptx":
                        return $this->pptx_to_text($this->filename);
                    default:
                        return "Invalid File Type";
                }
            }
        }