
Berikut percobaan melakukan scraping data masjid di SIMAS Kemenag menggunakan PHP, dengan permintaan paralel ("multi-thread") melalui multi cURL.
Script ini mengambil ID masjid dan lokasi masjid (koordinatnya) di Google Maps.
Scriptnya (simpan sebagai scrape.php dan awali dengan tag pembuka PHP):
// Scrape configuration: inclusive page range and the number of pages
// requested concurrently per batch.
$startPage   = 1;
$endPage     = 100;
$maxThreads  = 10;
$namaFile    = "hasil_masjid_page{$startPage}-{$endPage}.txt";
// Open the output file (truncating any previous run) and write the CSV header.
$fp = fopen($namaFile, 'w');
if ($fp === false) {
    // Without a valid handle every later fwrite() would fail, so stop before
    // any network work is done.
    throw new RuntimeException("Gagal membuka file keluaran: $namaFile");
}
fwrite($fp, "\"s_id\",\"link_detil\",\"link_peta\"\n");
/**
 * Fetch many URLs concurrently through the cURL multi interface.
 *
 * Each request follows redirects, sends a 'Mozilla/5.0' user agent and is
 * limited to 10 seconds in total.
 *
 * @param array $urls Map of arbitrary key => URL string.
 * @return array Map with the same keys as $urls. Each value is the response
 *               body string, or null when the transfer failed (cURL error,
 *               timeout) or finished with a non-2xx HTTP status.
 */
function fetchPagesMultiCurl($urls) {
    $multiHandle = curl_multi_init();
    $curlHandles = [];
    $results = [];
    foreach ($urls as $key => $url) {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0',
            CURLOPT_TIMEOUT => 10,
        ]);
        curl_multi_add_handle($multiHandle, $ch);
        $curlHandles[$key] = $ch;
    }

    // Drive all transfers until none is still running.
    $running = 0;
    do {
        $status = curl_multi_exec($multiHandle, $running);
        if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
            // The multi handle itself is in an error state; stop driving it.
            break;
        }
        // Block until there is socket activity. A return of -1 means select
        // could not wait, so sleep briefly instead of spinning.
        if ($running > 0 && curl_multi_select($multiHandle, 1.0) === -1) {
            usleep(100000);
        }
    } while ($running > 0);

    // Collect the per-transfer result codes so that failed or truncated
    // transfers are not mistaken for complete pages.
    $transferFailed = [];
    while (($info = curl_multi_info_read($multiHandle)) !== false) {
        foreach ($curlHandles as $key => $ch) {
            if ($ch === $info['handle']) {
                $transferFailed[$key] = ($info['result'] !== CURLE_OK);
                break;
            }
        }
    }

    foreach ($curlHandles as $key => $ch) {
        $body = curl_multi_getcontent($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $ok = empty($transferFailed[$key]) && $httpCode >= 200 && $httpCode < 300;
        // Report failures as null so callers can test the value for falsiness.
        $results[$key] = $ok ? $body : null;
        curl_multi_remove_handle($multiHandle, $ch);
        curl_close($ch);
    }
    curl_multi_close($multiHandle);
    return $results;
}
// Batch processing: walk the page range in chunks of at most $maxThreads
// pages, fetch each chunk in parallel and append one CSV row per mosque found.

// A zero or negative batch size would never advance the loop below.
$batchSize = max(1, (int) $maxThreads);
// Origin used to turn relative detail links into absolute URLs.
$baseUrl = 'https://simas.kemenag.go.id';
// Collect libxml parse errors internally instead of emitting PHP warnings;
// this is loop-invariant, so it is set once.
libxml_use_internal_errors(true);

for ($batchStart = $startPage; $batchStart <= $endPage; $batchStart += $batchSize) {
    $batchEnd = min($batchStart + $batchSize - 1, $endPage);
    $urls = [];
    for ($pageNo = $batchStart; $pageNo <= $batchEnd; $pageNo++) {
        $urls[$pageNo] = "https://simas.kemenag.go.id/page/profilmasjid/0/0/0/0/0?page=$pageNo";
    }
    printf("🔄 Memproses halaman %d - %d\n", $batchStart, $batchEnd);
    $responses = fetchPagesMultiCurl($urls);
    foreach ($responses as $page => $html) {
        if (!$html) {
            echo "⚠️ Gagal mengambil halaman $page\n";
            continue;
        }
        $dom = new DOMDocument();
        $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
        libxml_clear_errors();
        $xpath = new DOMXPath($dom);
        $linksDetil = $xpath->query("//a[contains(text(), 'Lihat Detil')]");
        $linksPeta  = $xpath->query("//a[contains(text(), 'Lihat di Peta')]");
        // The two link lists are paired by position. If their counts differ,
        // at least one entry lacks a link and the pairing may be shifted, so
        // make that visible instead of silently writing misaligned rows.
        if ($linksDetil->length !== $linksPeta->length) {
            echo "⚠️ Jumlah link detil dan peta berbeda di halaman $page\n";
        }
        $limit = min($linksDetil->length, $linksPeta->length);
        for ($i = 0; $i < $limit; $i++) {
            $detilHref = $linksDetil->item($i)->getAttribute('href');
            $petaHref  = $linksPeta->item($i)->getAttribute('href');
            // Only prefix the origin when the href is not already absolute.
            if (preg_match('#^https?://#i', $detilHref)) {
                $detilUrl = $detilHref;
            } else {
                $detilUrl = $baseUrl . '/' . ltrim($detilHref, '/');
            }
            $row = [
                "{$page}-" . ($i + 1),
                $detilUrl,
                $petaHref
            ];
            // Double any embedded quote so it cannot terminate the CSV field early.
            $row = str_replace('"', '""', $row);
            fwrite($fp, '"' . implode('","', $row) . '"' . "\n");
        }
        // Release the DOM objects before parsing the next page.
        unset($dom, $xpath, $linksDetil, $linksPeta);
    }
}
fclose($fp);
echo "🎉 Selesai! File berhasil dibuat: $namaFile\n";
Setelah scrape per batch (ukuran batch bisa diatur) selesai, hasilnya disimpan ke file .txt dengan format CSV.
Cara menjalankannya: jalankan perintah berikut di CMD atau terminal:
nohup php scrape.php
Semoga bermanfaat.
