<?php
/*
 * email.php
 *
 * Scans every message below ./emails (relative to the current working
 * directory) for the words listed in $words. A message whose subject or text
 * body contains one of the words is copied to the same relative location
 * below ./SAVED-EMAILS. Every processed message is then deleted from
 * ./emails; the only exception is a matching message that could not be
 * copied, which is left in place.
 *
 * WARNING: this script is destructive. Run it against a copy of the mailbox.
 */

// Import our mail parser
require_once('MimeMailParser.class.php');

// The list of words we want to keep
$words = array(
    'bail',
    'pretrial release',
    'pretrial justice',
    'bond',
    'bondsman',
    'custody release',
    'house arrest with electronic monitoring',
    'written promise',
    'surety',
    'jail',
    'jailer',
    'juvenile detention center',
);

// Resolve both roots from the working directory so the destination always
// mirrors the tree that was actually scanned, wherever the script is run.
$sourceRoot = realpath( getcwd() . DIRECTORY_SEPARATOR . 'emails' );
$savedRoot  = getcwd() . DIRECTORY_SEPARATOR . 'SAVED-EMAILS';

// Get all of the files
if( $sourceRoot === false ) {
    echo 'Error: no emails directory found in ' . getcwd() . '<br>';
    $files = array();
} else {
    $files = getDirContents( $sourceRoot );
}

foreach( $files as $filename ) {

    // getDirContents() lists directories as well as files. A directory is
    // not a message, so it must never be parsed or handed to unlink().
    if( is_dir( $filename ) ) {
        continue;
    }

    echo 'Working on: ' . $filename . '<br>';

    $parser = new MimeMailParser();
    $parser->setPath( $filename );

    // Cast to string so a missing header/body is searched as empty text
    // instead of being passed on as a non-string value.
    $subject = (string) $parser->getHeader( 'subject' );
    $text    = (string) $parser->getMessageBody( 'text' );

    $keep = contains( $subject, $words ) || contains( $text, $words );

    // Drop the parser before the file is copied or removed (presumably so
    // any handle it holds on the file is released first).
    unset( $parser );

    if( !$keep ) {
        // Delete the source file
        unlink( $filename );
        continue;
    }

    // Work out where the message sits relative to the scanned root
    $prefix = $sourceRoot . DIRECTORY_SEPARATOR;
    if( strncmp( $filename, $prefix, strlen( $prefix ) ) === 0 ) {
        $relative = substr( $filename, strlen( $prefix ) );
    } else {
        // realpath() resolved this entry to somewhere outside the tree
        $relative = basename( $filename );
    }

    // Get our destination path and make sure its directory exists
    $newname = $savedRoot . DIRECTORY_SEPARATOR . $relative;
    $newdir  = dirname( $newname );

    if( !is_dir( $newdir ) && !mkdir( $newdir, 0777, true ) ) {
        echo 'Error on copy: ' . $filename . ' to ' . $newname . '<br>';
        continue;
    }

    // Try to copy the file
    if( !copy( $filename, $newname ) ) {
        // copy returned false, which means we hit an error! Keep the source.
        echo 'Error on copy: ' . $filename . ' to ' . $newname . '<br>';
    } else {
        // Copy was fine, delete the source file
        unlink( $filename );
    }
}

/*
 * Recursively list everything below a directory.
 *
 * Each entry is returned as a resolved absolute path. A directory is listed
 * after all of its own contents. The "." and ".." entries, and entries whose
 * path cannot be resolved, are skipped.
 *
 * $dir      Directory to scan.
 * $results  Accumulator used by the recursion; callers normally omit it.
 *
 * Returns the accumulated list of paths (unchanged if $dir cannot be read).
 *
 * https://stackoverflow.com/questions/24783862/list-all-the-files-and-folders-in-a-directory-with-php-recursive-function
 */
function getDirContents($dir, &$results = array()){
    $entries = scandir($dir);

    if($entries === false) {
        return $results;
    }

    foreach($entries as $entry){
        if($entry === "." || $entry === "..") {
            continue;
        }

        $path = realpath($dir.DIRECTORY_SEPARATOR.$entry);

        // realpath() fails on e.g. broken links; do not record "false"
        if($path === false) {
            continue;
        }

        if(is_dir($path)) {
            getDirContents($path, $results);
        }

        $results[] = $path;
    }

    return $results;
}

/*
 * Case-insensitively check whether a string contains any of the given words.
 *
 * $str  Text to search; it is cast to a string first.
 * $arr  Words or phrases to look for. Empty entries are ignored, because an
 *       empty needle is handled differently by different PHP versions.
 *
 * Returns true as soon as one entry is found, false otherwise.
 *
 * https://stackoverflow.com/questions/13795789/check-if-string-contains-word-in-array
 */
function contains($str, array $arr)
{
    $haystack = (string) $str;

    foreach($arr as $a) {
        $needle = (string) $a;

        if ($needle !== '' && stripos($haystack, $needle) !== false) {
            return true;
        }
    }

    return false;
}
Public Records Request Tools
These are various scripts written by Nicole to assist the help desk in doing public records requests.
-
email.php
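This script walks every file in the `emails` directory tree (expected to be .eml messages) and checks each message's subject and text body for any word in a given list. Matching messages are copied to `SAVED-EMAILS`; the originals are then deleted, as are all non-matching messages, so run it against a copy of the mailbox. It requires the MimeMailParser class.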
-
files.php
This is a script that reads all the files in a directory tree and attempts to determine whether or not they contain any of the words in a list.<?php
/*
 * This file is best run from the command line
 * in the root directory of the files that need
 * to be scanned.
 *
 * It walks ./files, copies every file whose path or extracted text contains
 * one of $words to the same relative location below ./saved, and deletes
 * everything else (including files older than $start and directories that
 * end up empty). A kept file that could not be copied is left in place.
 *
 * WARNING: this script is destructive. Run it against a copy of the files.
 */

/*
composer.json contents:
{
    "require": {
        "smalot/pdfparser": "*"
    }
}
*/

// Set the memory limit to something stupidly high so we can load large files
ini_set('memory_limit','128G');

// Include the composer autoloader.
include 'vendor/autoload.php';

// The list of words we want to keep
$words = array(
    'judicial standards commission',
    'hunt gwynn',
    'gwynn',
);

// The fallback word which will be used to "force" a document to be saved when
// it cannot be read. This fixes problems with "secure" PDFs.
$catchWord = 'gwynn';

// Only keep files on or after this date
$start = '2018-06-01';

// Convert the lower bound time to a unix time (once, not per file)
$startTime = strtotime( $start );

// Resolve both roots from the working directory so the destination always
// mirrors the tree that was actually scanned.
$sourceRoot = realpath( getcwd() . DIRECTORY_SEPARATOR . 'files' );
$savedRoot  = getcwd() . DIRECTORY_SEPARATOR . 'saved';

// Get all of the files
if( $sourceRoot === false ) {
    echo "Error: no files directory found in " . getcwd() . "\n";
    $files = array();
} else {
    $files = getDirContents( $sourceRoot );
}

// getDirContents() lists a directory after everything inside it, so by the
// time a directory is reached all of its contents have been handled. One pass
// is therefore enough; re-queueing a non-empty directory would never end.
foreach( $files as $filename ) {

    // Windows loves to make stuff read only, fix that
    chmod( $filename, 0777 );

    // Output what file we're working on
    echo "Working on: " . $filename . "\n";

    // Is this a directory?
    if( is_dir( $filename ) ) {

        // Output that it is
        echo "Investigating directory: " . $filename . "\n";

        if( dir_is_empty( $filename ) ) {
            // Indicate this directory needs to go
            echo " Removing directory: " . $filename . "\n";
            rmdir( $filename );
        } else {
            // Something inside could not be moved or deleted; leave it alone
            echo " Leaving non-empty directory: " . $filename . "\n";
        }

        continue;
    }

    // Is the file older than the start time?
    $modified = filemtime( $filename );

    if( $startTime !== false && $modified !== false && $modified < $startTime ) {

        // Indicate we're deleting it for being old
        echo ' Deleted due to age (' . date( "F d Y H:i:s.", $modified ) . ")\n";

        // Delete the file
        unlink( $filename );
        continue;
    }

    // Get some information about the file
    $pathInfo  = pathinfo( $filename );
    $extension = isset( $pathInfo[ 'extension' ] ) ? strtolower( $pathInfo[ 'extension' ] ) : '';

    // Set up a variable to hold the text of the document
    $text = '';

    // Do different things based on the file extension
    switch( $extension ) {

        // Is this a pdf?
        case 'pdf':
            // Attempt to parse the file using PdfParser
            try {
                $parser = new \Smalot\PdfParser\Parser();
                $pdf    = $parser->parseFile( $filename );
                $text   = $pdf->getText();

            // This is a catch to ensure any secure pdfs (which can't be
            // scanned) are just assumed to be relevant
            } catch( Exception $e ) {
                $text = $catchWord;
            }
            break;

        // Is it an office document?
        case 'docx':
        case 'xlsx':
        case 'pptx':
        case 'doc':
            $docObj = new DocxConversion( $filename );
            $text   = $docObj->convertToText();
            break;

        // Everything else just dump the raw file into our text variable
        default:
            $text = file_get_contents( $filename );
            break;
    }

    // A reader that returned false could not read the file at all. Treat it
    // like a secure PDF and force it to be kept rather than deleting it.
    if( $text === false ) {
        $text = $catchWord;
    }

    // Check if the filename or file contents contain any of our words
    if( !contains( $filename, $words ) && !contains( $text, $words ) ) {

        // Indicate we're deleting it
        echo " DELETE: " . $filename . "\n";

        // Delete the source file
        unlink( $filename );
        continue;
    }

    // Work out where the file sits relative to the scanned root
    $prefix = $sourceRoot . DIRECTORY_SEPARATOR;
    if( strncmp( $filename, $prefix, strlen( $prefix ) ) === 0 ) {
        $relative = substr( $filename, strlen( $prefix ) );
    } else {
        // realpath() resolved this entry to somewhere outside the tree
        $relative = basename( $filename );
    }

    // Create our new filename using the new path
    $newname = $savedRoot . DIRECTORY_SEPARATOR . $relative;
    $newdir  = dirname( $newname );

    // Check if the destination directory exists, create it if not
    if( !is_dir( $newdir ) && !mkdir( $newdir, 0777, true ) ) {
        echo " Error on copy: " . $filename . " to " . $newname . "\n";
        continue;
    }

    // Try to copy the file
    if( !copy( $filename, $newname ) ) {

        // copy returned false, which means we hit an error! Keep the source.
        echo " Error on copy: " . $filename . " to " . $newname . "\n";

    } else {

        // Indicate we're keeping the file
        echo " KEEP: " . $filename . "\n";

        // Delete the source file
        unlink( $filename );
    }
}

/*
 * Recursively list everything below a directory.
 *
 * Each entry is returned as a resolved absolute path. A directory is listed
 * after all of its own contents. The "." and ".." entries, and entries whose
 * path cannot be resolved, are skipped.
 *
 * $dir      Directory to scan.
 * $results  Accumulator used by the recursion; callers normally omit it.
 *
 * Returns the accumulated list of paths (unchanged if $dir cannot be read).
 *
 * https://stackoverflow.com/questions/24783862/list-all-the-files-and-folders-in-a-directory-with-php-recursive-function
 */
function getDirContents($dir, &$results = array()){
    $entries = scandir($dir);

    if($entries === false) {
        return $results;
    }

    foreach($entries as $entry){
        if($entry === "." || $entry === "..") {
            continue;
        }

        $path = realpath($dir.DIRECTORY_SEPARATOR.$entry);

        // realpath() fails on e.g. broken links; do not record "false"
        if($path === false) {
            continue;
        }

        if(is_dir($path)) {
            getDirContents($path, $results);
        }

        $results[] = $path;
    }

    return $results;
}

/*
 * Tell whether a directory has no entries other than "." and "..".
 *
 * Returns TRUE when it is empty. Returns FALSE when it has entries or cannot
 * be opened, so a directory that cannot be inspected is never removed.
 *
 * https://stackoverflow.com/questions/7497733/how-can-i-use-php-to-check-if-a-directory-is-empty
 */
function dir_is_empty($dir) {
    $handle = opendir($dir);

    if ($handle === false) {
        return FALSE;
    }

    $empty = TRUE;

    while (false !== ($entry = readdir($handle))) {
        if ($entry != "." && $entry != "..") {
            $empty = FALSE;
            break;
        }
    }

    closedir($handle);

    return $empty;
}

/*
 * Case-insensitively check whether a string contains any of the given words.
 *
 * $str  Text to search; it is cast to a string first.
 * $arr  Words or phrases to look for. Empty entries are ignored, because an
 *       empty needle is handled differently by different PHP versions.
 *
 * Returns true as soon as one entry is found, false otherwise.
 *
 * https://stackoverflow.com/questions/13795789/check-if-string-contains-word-in-array
 */
function contains($str, array $arr)
{
    $haystack = (string) $str;

    foreach($arr as $a) {
        $needle = (string) $a;

        if ($needle !== '' && stripos($haystack, $needle) !== false) {
            return true;
        }
    }

    return false;
}

/*
 * Best-effort plain text extraction for .doc, .docx, .xlsx and .pptx files.
 *
 * Construct it with a file path and call convertToText().
 */
class DocxConversion{
    private $filename;

    public function __construct($filePath) {
        $this->filename = $filePath;
    }

    /*
     * Pull readable text out of a legacy .doc file.
     *
     * The file is split on carriage returns and every chunk containing a NUL
     * byte is dropped; what is left is filtered down to a small set of
     * printable characters.
     *
     * Returns the text, or false when the file cannot be read.
     */
    private function read_doc() {
        // file_get_contents() copes with empty files and leaves no open handle
        $raw = file_get_contents($this->filename);

        if ($raw === false) {
            return false;
        }

        $outtext = "";

        foreach(explode(chr(0x0D), $raw) as $thisline) {
            if (strlen($thisline) == 0 || strpos($thisline, chr(0x00)) !== FALSE) {
                continue;
            }

            $outtext .= $thisline." ";
        }

        return preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r @\/\_\(\)]/","",$outtext);
    }

    /*
     * Pull the text out of word/document.xml inside a .docx archive.
     *
     * Returns the text ('' when the archive has no document part), or false
     * when the archive cannot be opened.
     */
    private function read_docx(){
        $zip = new ZipArchive;

        if ($zip->open($this->filename) !== true) {
            return false;
        }

        $content = $zip->getFromName("word/document.xml");
        $zip->close();

        if ($content === false) {
            return '';
        }

        // Keep table cells and paragraphs apart once the tags are stripped
        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content);

        return strip_tags($content);
    }

    /*
     * Reduce an XML string to its text content.
     *
     * The XML is parsed without LIBXML_NOENT so entities declared inside an
     * untrusted document are not expanded. If it does not parse, the tags
     * are stripped from the raw string instead.
     */
    private function xml_to_text($xml) {
        if ($xml === '') {
            return "";
        }

        $document = new DOMDocument();

        if (!$document->loadXML($xml, LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return strip_tags($xml);
        }

        return strip_tags($document->saveXML());
    }

    /*
     * Functions from stackoverflow to change office stuff into text files
     */

    /*
     * Extract the shared strings of an .xlsx workbook.
     *
     * $input_file  Path to the workbook.
     *
     * Returns the text, or "" when the archive cannot be opened or has no
     * xl/sharedStrings.xml part.
     */
    public function xlsx_to_text($input_file){
        $zip_handle = new ZipArchive;

        if(true !== $zip_handle->open($input_file)){
            return "";
        }

        $xml_datas = $zip_handle->getFromName("xl/sharedStrings.xml");
        $zip_handle->close();

        if($xml_datas === false){
            return "";
        }

        return $this->xml_to_text($xml_datas);
    }

    /*
     * Extract the text of every slide in a .pptx presentation.
     *
     * Slides are found by name rather than by counting up from 1, so a gap
     * in the slide numbering does not cut the scan short. They are read in
     * numeric order.
     *
     * $input_file  Path to the presentation.
     *
     * Returns the text, or "" when the archive cannot be opened or has no
     * slides.
     */
    public function pptx_to_text($input_file){
        $zip_handle = new ZipArchive;

        if(true !== $zip_handle->open($input_file)){
            return "";
        }

        // Map slide number => archive index
        $slides = array();

        for($index = 0; $index < $zip_handle->numFiles; $index++){
            $name = $zip_handle->getNameIndex($index);

            if($name !== false && preg_match('#^ppt/slides/slide(\d+)\.xml$#', $name, $match)){
                $slides[(int) $match[1]] = $index;
            }
        }

        ksort($slides);

        $output_text = "";

        foreach($slides as $index){
            $xml_datas = $zip_handle->getFromIndex($index);

            if($xml_datas !== false){
                $output_text .= $this->xml_to_text($xml_datas);
            }
        }

        $zip_handle->close();

        return $output_text;
    }

    /*
     * Convert the file given to the constructor into plain text.
     *
     * The reader is picked from the file extension, ignoring case.
     *
     * Returns the extracted text, false when a .doc/.docx file cannot be
     * read, "File Not exists" when the path does not exist, or
     * "Invalid File Type" for any other extension.
     */
    public function convertToText() {
        if(isset($this->filename) && !file_exists($this->filename)) {
            return "File Not exists";
        }

        $fileArray = pathinfo($this->filename);
        $file_ext  = isset($fileArray['extension']) ? strtolower($fileArray['extension']) : '';

        switch($file_ext) {
            case "doc":
                return $this->read_doc();

            case "docx":
                return $this->read_docx();

            case "xlsx":
                return $this->xlsx_to_text($this->filename);

            case "pptx":
                return $this->pptx_to_text($this->filename);

            default:
                return "Invalid File Type";
        }
    }
}
All Systems Operational
Our operations staff has determined that all major ITD services are operating normally.