Crawl website data with Simple HTML DOM

require_once 'simple_html_dom.php';

/**
 * Download a remote image into the uploads directory, register it as an
 * attachment of the given post and set it as that post's featured image.
 *
 * Every step that can fail (download, file-type validation, write to disk,
 * attachment creation) is checked; on failure the function stops early
 * instead of creating a broken attachment.
 *
 * @param string $image_url Absolute URL of the image to download.
 * @param int    $post_id   ID of the post that receives the featured image.
 * @return int|false ID of the created attachment, or false when a step failed.
 */
function Generate_Featured_Image( $image_url, $post_id  ){
    if (empty($image_url) || empty($post_id)) {
        return false;
    }

    // Derive the file name from the URL path only, so a query string or
    // fragment ("photo.jpg?v=2") does not end up in the stored file name.
    $url_path = parse_url($image_url, PHP_URL_PATH);
    if (!is_string($url_path) || $url_path === '') {
        $url_path = $image_url;
    }
    $filename = sanitize_file_name(basename($url_path));
    if ($filename === '') {
        return false;
    }

    // The content comes from a remote site: refuse anything whose extension
    // is not an upload type WordPress allows before touching the disk.
    $wp_filetype = wp_check_filetype($filename, null );
    if (empty($wp_filetype['type'])) {
        return false;
    }

    $image_data = file_get_contents($image_url);
    if ($image_data === false || $image_data === '') {
        return false;
    }

    $upload_dir = wp_upload_dir();
    if(wp_mkdir_p($upload_dir['path'])){
        $target_dir = $upload_dir['path'];
    } else {
        $target_dir = $upload_dir['basedir'];
    }

    // Avoid overwriting an existing upload that happens to share the name.
    $filename = wp_unique_filename($target_dir, $filename);
    $file = $target_dir . '/' . $filename;

    if (file_put_contents($file, $image_data) === false) {
        return false;
    }

    $attachment = array(
        'post_mime_type' => $wp_filetype['type'],
        'post_title' => sanitize_file_name($filename),
        'post_content' => '',
        'post_status' => 'inherit'
    );
    $attach_id = wp_insert_attachment( $attachment, $file, $post_id );
    if (!$attach_id || is_wp_error($attach_id)) {
        return false;
    }

    // wp_generate_attachment_metadata() lives in an admin include that is not
    // loaded on front-end requests.
    require_once(ABSPATH . 'wp-admin/includes/image.php');
    $attach_data = wp_generate_attachment_metadata( $attach_id, $file );
    wp_update_attachment_metadata( $attach_id, $attach_data );
    set_post_thumbnail( $post_id, $attach_id );

    return $attach_id;
}

/**
 * Import news articles from https://vnisinvestment.com/tin-tuc as posts.
 *
 * Runs only for non-admin-screen requests carrying ?act=crawl. For each item
 * on the listing page it fetches the detail page, and when no post with the
 * same title exists yet it inserts a published post, assigns category term 1
 * and attaches the listing thumbnail as featured image. The request is
 * terminated with die() once the crawl has been attempted.
 *
 * NOTE(review): is_admin() only tells whether the request targets an admin
 * screen, not whether the visitor is privileged, so any visitor can trigger
 * the crawl with ?act=crawl. Consider a capability check, a secret token or
 * WP-Cron; left unchanged here to keep the existing trigger working.
 *
 * @return void
 */
function add_crawl_data(){
	if (is_admin() || !isset($_GET['act']) || $_GET['act'] !== 'crawl'){
		return;
	}

	// post_exists() is defined in an admin include that is not loaded on
	// front-end requests.
	if ( ! function_exists( 'post_exists' ) ) {
	    require_once( ABSPATH . 'wp-admin/includes/post.php' );
	}

	// file_get_html() returns false when the page cannot be fetched/parsed.
	$html_content = file_get_html('https://vnisinvestment.com/tin-tuc');
	$list_post = $html_content
		? $html_content->find('#tth-content .list_item .row_item .col_item')
		: array();

	foreach ($list_post as $post){
		$link_node = $post->find('.news_img a', 0);
		if (!$link_node || empty($link_node->href)){
			continue;
		}
		$post_link = $link_node->href;

		// The thumbnail is a CSS background: pull the URL out of "url(...)".
		// Reset per item so a missing thumbnail never reuses the previous one.
		$img = '';
		foreach ($post->find('.news_img a .img') as $thumb_node){
			if (preg_match('/\(([^)]+)\)/', (string) $thumb_node->style, $match)){
				$img = trim(str_replace(array("'", '"'), '', $match[1]));
			}
		}

		// Post detail
		$html_detail = file_get_html($post_link);
		if (!$html_detail){
			continue;
		}

		$title_node = $html_detail->find('#item_detail h1', 0);
		$content_node = $html_detail->find('#item_detail .item-content', 0);
		$date_node = $html_detail->find('#item_detail .item-date .date', 0);
		$time_node = $html_detail->find('#item_detail .item-date .time', 0);

		// Copy everything out as plain strings before the DOM is released.
		$title = $title_node ? $title_node->plaintext : '';
		$content = $content_node ? $content_node->innertext : '';
		$date = $date_node ? trim($date_node->innertext) : '';
		$time = $time_node ? trim($time_node->innertext) : '';

		// Simple HTML DOM keeps circular references; free each detail page
		// explicitly so memory does not grow with every crawled article.
		$html_detail->clear();
		unset($html_detail);

		if ($title === '' || !$title_node || !$content_node){
			continue;
		}

		if (post_exists($title) !== 0){
			continue;
		}

		$args = array(
			'post_type' => 'post',
			'post_status' => 'publish',
			'post_title' => $title,
			'post_content' => $content
		);

		// Swapping "/" for "-" makes strtotime() read the date day-first.
		// When the date cannot be parsed, leave post_date unset so WordPress
		// uses the current time instead of 1970-01-01.
		$timestamp = ($date !== '') ? strtotime(str_replace('/', '-', $date)) : false;
		if ($timestamp !== false){
			$date_time = date('Y-m-d', $timestamp);
			if ($time !== ''){
				$date_time .= ' ' . $time;
			}
			$args['post_date'] = $date_time;
		}

		$post_id = wp_insert_post($args);

		if ($post_id && !is_wp_error($post_id)){
			// Category term ID 1 is hard-coded — presumably the site's default
			// category; verify against the target install.
			wp_set_post_terms($post_id, array(1), 'category');

			if ($img !== ''){
				Generate_Featured_Image($img, $post_id);
			}
		}
	}

	die();
}

// Hook add_crawl_data into WordPress's 'init' action; the function itself
// decides whether the current request should trigger a crawl.
add_action('init', 'add_crawl_data');

Leave a Reply

Your email address will not be published. Required fields are marked *