Image description

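Mencoba melakukan scrape data masjid di SIMAS Kemenag menggunakan PHP dengan banyak permintaan paralel sekaligus melalui multi-cURL (bukan multi-thread sungguhan, tetapi beberapa koneksi berjalan bersamaan dalam satu proses).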

Skrip ini mengambil ID masjid, tautan halaman detailnya, serta tautan lokasi (koordinat) masjid di Google Maps.

Scriptnya (simpan sebagai scrape.php dan awali dengan tag pembuka `<?php`):

// Scrape configuration: inclusive page range, how many pages are fetched
// concurrently per batch, and the output file name derived from the range.
$startPage   = 1;
$endPage     = 100;
$maxThreads  = 10;
$namaFile    = "hasil_masjid_page{$startPage}-{$endPage}.txt";

// Open the output file (mode 'w' truncates any previous run) and write the
// CSV header. Abort early if the file cannot be opened, instead of fetching
// every page first and only then failing on the write.
$fp = fopen($namaFile, 'w');
if ($fp === false) {
    fwrite(STDERR, "Gagal membuka file output: $namaFile\n");
    exit(1);
}
fwrite($fp, "\"s_id\",\"link_detil\",\"link_peta\"\n");

/**
 * Fetch many URLs concurrently through a single cURL multi handle.
 *
 * The keys of $urls are preserved, so callers can map each response back to
 * the request that produced it.
 *
 * @param array $urls Map of key => URL string.
 *
 * @return array Map of the same keys => response body (string), or false when
 *               the transfer did not complete successfully (cURL error such
 *               as a timeout or connection failure) or the server answered
 *               with an HTTP status >= 400.
 */
function fetchPagesMultiCurl(array $urls): array {
    // Nothing to fetch: avoid creating a multi handle at all.
    if ($urls === []) {
        return [];
    }

    $multiHandle = curl_multi_init();
    $curlHandles = [];
    $results = [];

    foreach ($urls as $key => $url) {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0',
            CURLOPT_TIMEOUT => 10,
        ]);
        curl_multi_add_handle($multiHandle, $ch);
        $curlHandles[$key] = $ch;
    }

    // Drive all transfers. Only wait for socket activity while something is
    // still running, and stop if the multi interface itself reports an error.
    $running = 0;
    do {
        $status = curl_multi_exec($multiHandle, $running);
        if ($running > 0 && curl_multi_select($multiHandle, 1.0) === -1) {
            // select() failed; back off briefly so the loop does not spin.
            usleep(10000);
        }
    } while ($running > 0 && $status === CURLM_OK);

    // Collect the per-transfer result codes. Without this, a timed-out
    // transfer would hand back a partial body indistinguishable from success.
    $outcome = [];
    while (($info = curl_multi_info_read($multiHandle)) !== false) {
        $key = array_search($info['handle'], $curlHandles, true);
        if ($key !== false) {
            $outcome[$key] = $info['result'];
        }
    }

    foreach ($curlHandles as $key => $ch) {
        // A handle with no recorded outcome never finished (loop aborted).
        $completed = ($outcome[$key] ?? null) === CURLE_OK;
        $httpOk    = curl_getinfo($ch, CURLINFO_HTTP_CODE) < 400;
        $body      = curl_multi_getcontent($ch);

        $results[$key] = ($completed && $httpOk && is_string($body)) ? $body : false;

        curl_multi_remove_handle($multiHandle, $ch);
        curl_close($ch);
    }

    curl_multi_close($multiHandle);
    return $results;
}

// Batch processing: fetch up to $maxThreads listing pages at a time, extract
// the "detail" and "map" links from each page, and append one CSV row per pair.

// Collect libxml parse warnings internally instead of printing them; this is
// a process-wide switch, so it only needs to be set once.
libxml_use_internal_errors(true);

// Guard against a non-positive batch size, which would never advance the loop.
$batchSize = max(1, (int) $maxThreads);
$baseUrl   = 'https://simas.kemenag.go.id';

for ($batchStart = $startPage; $batchStart <= $endPage; $batchStart += $batchSize) {
    $batchEnd = min($batchStart + $batchSize - 1, $endPage);

    // Keyed by page number so each response can be traced back to its page.
    $urls = [];
    for ($pageNo = $batchStart; $pageNo <= $batchEnd; $pageNo++) {
        $urls[$pageNo] = "https://simas.kemenag.go.id/page/profilmasjid/0/0/0/0/0?page=$pageNo";
    }

    printf("🔄 Memproses halaman %d - %d\n", $batchStart, $batchEnd);
    $responses = fetchPagesMultiCurl($urls);

    foreach ($responses as $page => $html) {
        // Failed transfers come back as a non-string or an empty body.
        if (!is_string($html) || $html === '') {
            echo "⚠️ Gagal mengambil halaman $page\n";
            continue;
        }

        $dom = new DOMDocument();
        $parsed = $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
        libxml_clear_errors();
        if ($parsed === false) {
            echo "⚠️ Gagal mem-parsing halaman $page\n";
            continue;
        }

        $xpath = new DOMXPath($dom);
        $linksDetil = $xpath->query("//a[contains(text(), 'Lihat Detil')]");
        $linksPeta  = $xpath->query("//a[contains(text(), 'Lihat di Peta')]");

        // Links are paired purely by position. If the two lists differ in
        // length, at least one entry lacks a counterpart and the pairs after
        // it may belong to different mosques, so make that visible.
        // NOTE(review): pairing per listing item would be more reliable, but
        // the page's markup is not visible here — confirm against the site.
        if ($linksDetil->length !== $linksPeta->length) {
            printf(
                "⚠️ Jumlah link detil (%d) dan link peta (%d) berbeda di halaman %d\n",
                $linksDetil->length,
                $linksPeta->length,
                $page
            );
        }

        $limit = min($linksDetil->length, $linksPeta->length);

        for ($idx = 0; $idx < $limit; $idx++) {
            $detilHref = $linksDetil->item($idx)->getAttribute('href');
            $petaHref  = $linksPeta->item($idx)->getAttribute('href');

            // Only prefix the host when the href is not already absolute.
            $detilUrl = preg_match('#^https?://#i', $detilHref) === 1
                ? $detilHref
                : $baseUrl . '/' . ltrim($detilHref, '/');

            $row = [
                "{$page}-" . ($idx + 1),
                $detilUrl,
                $petaHref,
            ];

            // Double any embedded quote so a field containing '"' cannot
            // break out of its quoted CSV column.
            $escaped = array_map(
                static function ($field) {
                    return str_replace('"', '""', (string) $field);
                },
                $row
            );

            fwrite($fp, '"' . implode('","', $escaped) . '"' . "\n");
        }

        // Release the DOM objects before parsing the next page.
        unset($dom, $xpath, $linksDetil, $linksPeta);
    }
}

fclose($fp);
echo "🎉 Selesai! File berhasil dibuat: $namaFile\n";

Scrape dilakukan per batch (ukuran batch bisa diatur lewat $maxThreads), lalu hasilnya disimpan ke file .txt dengan format CSV.

Cara menjalankannya: jalankan perintah berikut di terminal Linux/macOS (di CMD Windows tidak ada nohup, cukup jalankan `php scrape.php`):

nohup php scrape.php &

Semoga bermanfaat

Image description