Mencoba melakukan scrape data masjid di SIMAS Kemenag menggunakan PHP, dengan mengambil banyak halaman secara paralel lewat multi cURL (curl_multi), bukan thread sungguhan.
Script ini mengambil ID masjid, tautan detail, serta tautan lokasi (koordinat) masjid di Google Maps.
Scriptnya (simpan sebagai scrape.php dan awali file dengan tag pembuka <?php):
// Scrape configuration: first and last listing page to fetch (inclusive) and
// how many pages are requested concurrently in each batch.
$startPage = 1;
$endPage = 100;
$maxThreads = 10;
$namaFile = "hasil_masjid_page{$startPage}-{$endPage}.txt";

// Open the output file (truncating any previous content) and write the CSV header.
$fp = fopen($namaFile, 'w');
if ($fp === false) {
    // Abort early: every later fwrite() would otherwise operate on an invalid handle.
    fwrite(STDERR, "❌ Gagal membuka file $namaFile\n");
    exit(1);
}
fwrite($fp, "\"s_id\",\"link_detil\",\"link_peta\"\n");
/**
 * Fetch many URLs concurrently through a single cURL multi handle.
 *
 * @param array $urls Map of caller-chosen key => URL to fetch.
 * @return array Map with the same keys as $urls. Each value is the response
 *               body as a string, or null when that transfer did not complete
 *               successfully (transport error, timeout, or HTTP status >= 400).
 */
function fetchPagesMultiCurl($urls) {
    $results = [];
    if (empty($urls)) {
        // Nothing to fetch; avoid setting up a multi handle at all.
        return $results;
    }

    $multiHandle = curl_multi_init();
    $curlHandles = [];
    foreach ($urls as $key => $url) {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0',
            CURLOPT_TIMEOUT => 10,
            // Report HTTP 4xx/5xx as a failed transfer instead of handing the
            // error page back to the caller as if it were real content.
            CURLOPT_FAILONERROR => true,
        ]);
        curl_multi_add_handle($multiHandle, $ch);
        $curlHandles[$key] = $ch;
    }

    // Drive all transfers until none are still running.
    $running = null;
    do {
        $status = curl_multi_exec($multiHandle, $running);
        if ($status === CURLM_CALL_MULTI_PERFORM) {
            // Old libcurl versions ask to be called again immediately.
            continue;
        }
        if ($status !== CURLM_OK) {
            // The multi handle itself failed; unfinished transfers are reported as null below.
            break;
        }
        if ($running > 0 && curl_multi_select($multiHandle, 1.0) === -1) {
            // select() could not wait on anything; sleep briefly to avoid a busy loop.
            usleep(1000);
        }
    } while ($running > 0);

    // Per-transfer results are only available through the multi handle's
    // message queue, so record which handles finished without error.
    $succeeded = [];
    while (($info = curl_multi_info_read($multiHandle)) !== false) {
        if ($info['msg'] === CURLMSG_DONE && $info['result'] === CURLE_OK) {
            $doneKey = array_search($info['handle'], $curlHandles, true);
            if ($doneKey !== false) {
                $succeeded[$doneKey] = true;
            }
        }
    }

    foreach ($curlHandles as $key => $ch) {
        $results[$key] = isset($succeeded[$key]) ? curl_multi_getcontent($ch) : null;
        curl_multi_remove_handle($multiHandle, $ch);
        curl_close($ch);
    }
    curl_multi_close($multiHandle);

    return $results;
}
// Batch processing: fetch up to $maxThreads listing pages at a time and append
// one CSV row per (detail link, map link) pair found on each page.

// Collect libxml parse problems internally instead of emitting PHP warnings.
// The setting is process-wide, so it is applied once rather than per page.
libxml_use_internal_errors(true);

// A zero or negative batch size would keep the outer loop from ever advancing.
$batchSize = max(1, (int) $maxThreads);
$baseUrl = 'https://simas.kemenag.go.id';

// Quote one CSV field, doubling embedded double quotes so the row stays parseable.
$csvQuote = static function ($value) {
    return '"' . str_replace('"', '""', (string) $value) . '"';
};

for ($batchStart = $startPage; $batchStart <= $endPage; $batchStart += $batchSize) {
    $batchEnd = min($batchStart + $batchSize - 1, $endPage);

    // Key each URL by its page number so responses can be matched back to pages.
    $urls = [];
    for ($pageNo = $batchStart; $pageNo <= $batchEnd; $pageNo++) {
        $urls[$pageNo] = "https://simas.kemenag.go.id/page/profilmasjid/0/0/0/0/0?page=$pageNo";
    }

    printf("🔄 Memproses halaman %d - %d\n", $batchStart, $batchEnd);
    $responses = fetchPagesMultiCurl($urls);

    foreach ($responses as $page => $html) {
        if (!$html) {
            echo "⚠️ Gagal mengambil halaman $page\n";
            continue;
        }

        $dom = new DOMDocument();
        $parsed = $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
        libxml_clear_errors();
        if ($parsed === false) {
            echo "⚠️ Gagal mem-parsing halaman $page\n";
            continue;
        }

        $xpath = new DOMXPath($dom);
        $linksDetil = $xpath->query("//a[contains(text(), 'Lihat Detil')]");
        $linksPeta = $xpath->query("//a[contains(text(), 'Lihat di Peta')]");

        // NOTE(review): the two link lists are paired purely by position. If an
        // entry on the page lacks one of the two links, later rows would be
        // misaligned — confirm against the real page structure.
        $limit = min($linksDetil->length, $linksPeta->length);
        for ($idx = 0; $idx < $limit; $idx++) {
            $detilHref = $linksDetil->item($idx)->getAttribute('href');
            $petaHref = $linksPeta->item($idx)->getAttribute('href');

            // Prefix the site origin only for relative hrefs; an already
            // absolute URL must not get the origin prepended a second time.
            if (!preg_match('#^https?://#i', $detilHref)) {
                $detilHref = $baseUrl . '/' . ltrim($detilHref, '/');
            }

            $row = [
                "{$page}-" . ($idx + 1),
                $detilHref,
                $petaHref,
            ];
            fwrite($fp, implode(',', array_map($csvQuote, $row)) . "\n");
        }

        // Release DOM memory before moving on to the next page.
        unset($dom, $xpath, $linksDetil, $linksPeta);
    }
}

fclose($fp);
echo "🎉 Selesai! File berhasil dibuat: $namaFile\n";
Halaman di-scrape per batch (ukuran batch dapat diatur lewat $maxThreads), lalu hasilnya disimpan ke file .txt dengan format CSV.
Cara menjalankannya: jalankan perintah berikut di terminal (nohup hanya tersedia di Linux/macOS; di CMD Windows cukup jalankan php scrape.php):
nohup php scrape.php
Semoga bermanfaat