PHP查询csv数据库
相信很多朋友都找到那些数据并下载下来了。
但是一下下来就傻眼了,大的是7、8G的SQL文件,稍小的也是别人导成CSV格式的也有3、4G之大。普通的文本工具一打开就崩溃,即使不崩溃,电脑内存瞬间吃光也会卡死。
借由这个契机,写了几个PHP小程序去查询,顺便回顾了一下PHP中的多进程和多线程编程。
常规编程,单线程,单进程,顺序执行。
<?php
class Searcher{
protected $fileList = [];
public function __construct()
{
$this->currentTasks[0] = 'main';
$dir = dir( dirname(__FILE__).'/2000W' );
while ( ($file = $dir->read() )!=FALSE ){
if ( strpos($file,'.csv') >0 ){
$this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
}
}
$dir->close();
}
public function run($word)
{
$start_time = time();
foreach($this->fileList as $file){
findInFile($file,$word);
}
echo "Finished in ", time()-$start_time , "seconds.\n";
}
}
function findInFile($file,$word){
echo "start searching $word in $file \n";
$fp = fopen($file,'rb');
$i = 0;
while (!feof($fp)){
$i ++;
$line = fgets($fp);
if ( strstr($line,$word) ){
echo "$file [$l]: $line \n ";
}
}
fclose($fp);
echo "Finish searching in $file \n ";
}
$obj = new Searcher();
$obj->run('张三');
现在尝试一下用pnctl扩展实现多进程:
<?php
class Searcher{
protected $fileList = [];
public function __construct()
{
$this->currentTasks[0] = 'main';
$dir = dir( dirname(__FILE__).'/2000W' );
while ( ($file = $dir->read() )!=FALSE ){
if ( strpos($file,'.csv') >0 ){
$this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
}
}
$dir->close();
}
public function runMultiProcess($word)
{
echo "Begin...\n";
$start_time = time();
foreach($this->fileList as $file){
$lanched = $this->runProcess($file,$word);
}
//wait for all child processes to finish
while ( pcntl_waitpid(0,$status) != -1){
$status = pcntl_wexitstatus($status);
echo "Child($status) Exit.\n";
}
$cost = time() -$start_time;
echo "All tasks finished in $cost secends.\n";
}
protected function runProcess($file,$word)
{
$pid = pcntl_fork();
if ($pid == -1 ){ //error occurs
echo "error occurs while fork new task for [$file]\n";
return false;
}elseif($pid){ // parent process
//echo "in parent process.\n";
}else{//forked child
$mypid = getmypid();
echo "start child process $mypid to read file $file \n";
$exitStatus = $mypid;
findInFile($file,$word);
exit($exitStatus);
}
return true;
}
}
function findInFile($file,$word){
echo "start searching $word in $file \n";
$fp = fopen($file,'rb');
$i = 0;
while (!feof($fp)){
$i ++;
$line = fgets($fp);
if ( strstr($line,$word) ){
echo "$file [$l]: $line \n ";
}
}
fclose($fp);
echo "Finish searching in $file \n ";
}
$obj = new Searcher();
$obj->runMultiProcess('张三');
进程是个比较烦琐的东西,什么信号啊之类的是最难掌握的,更重要的是各进程之间相互通讯比较困难。
现在再试试基于pthreads扩展的多线程
<?php
class Searcher{
protected $fileList = [];
public function __construct()
{
$this->currentTasks[0] = 'main';
$dir = dir( dirname(__FILE__).'/2000W' );
while ( ($file = $dir->read() )!=FALSE ){
if ( strpos($file,'.csv') >0 ){
$this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
}
}
$dir->close();
}
public function runMultiThread($word)
{
$start_time = time();
$ths = [];
foreach($this->fileList as $i=>$file){
$ths[$i] = new ThSearch($file,$word);
$ths[$i]->start();
}
foreach($ths as $th){
$th->join();
}
echo "Finished in ", time()-$start_time , "seconds.\n";
}
}
function findInFile($file,$word){
echo "start searching $word in $file \n";
$fp = fopen($file,'rb');
$i = 0;
while (!feof($fp)){
$i ++;
$line = fgets($fp);
if ( strstr($line,$word) ){
echo "$file [$l]: $line \n ";
}
}
fclose($fp);
echo "Finish searching in $file \n ";
}
class ThSearch extends Thread{
protected $file;
protected $word;
public function __construct($file,$word)
{
$this->file = $file;
$this->word = $word;
}
public function run()
{
findInFile($this->file,$this->word);
}
}
$obj = new Searcher();
$obj->runMultiThread('张三');
线程相比进程要轻得多,而且也更好控制。
也许你会发现这三种方法性能为什么都差不多? 因为在这个例子中,性能的瓶颈主要在磁盘IO,硬盘读写上限就那么多,自然性能也相差无几了。
from https://gebsni.com/a/13/10/how-to-search-person-from-200-million-data.html