HEX
Server: Apache/2.4.65 (Debian)
System: Linux 88f31f35b0b8 6.1.0-38-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.1.147-1 (2025-08-02) x86_64
User: www-data (33)
PHP: 8.2.29
Disabled: NONE
Upload Files
File: /var/www/html/wp-content/plugins/broken-link-checker-seo/app/Links/Data.php
<?php
namespace AIOSEO\BrokenLinkChecker\Links;

// Exit if accessed directly, i.e. when the ABSPATH constant has not been defined before this file is loaded.
if ( ! defined( 'ABSPATH' ) ) {
	exit;
}

use AIOSEO\BrokenLinkChecker\Models;

/**
 * Handles the extraction, parsing and storage of links for the links scan.
 *
 * @since 1.0.0
 */
class Data {
	/**
	 * The ignored extensions.
	 *
	 * @since 1.0.0
	 *
	 * @var array
	 */
	private $ignoredExtensions = [];

	/**
	 * Class constructor.
	 *
	 * Populates the list of ignored file extensions so it is available as soon as the instance exists.
	 *
	 * @since 1.0.0
	 */
	public function __construct() {
		$this->setIgnoredExtensions();
	}

	/**
	 * Rebuilds the link index for a single post.
	 *
	 * All links previously stored for the post are removed before its content is parsed again,
	 * so links that were taken out of the content do not linger in the DB.
	 *
	 * @since 1.0.0
	 *
	 * @param  int  $postId The post ID.
	 * @return void
	 */
	public function indexLinks( $postId ) {
		$post = get_post( $postId );
		if ( ! ( $post instanceof \WP_Post ) ) {
			return;
		}

		// Wipe the stored links first; otherwise links that no longer exist in the content would never be removed.
		Models\Link::deleteLinks( $postId );

		$extractedLinks = $this->extractLinks( $postId, $post->post_content );
		if ( ! empty( $extractedLinks ) ) {
			$this->storeLinks( $extractedLinks );
		}
	}

	/**
	 * Stores the given links to the DB.
	 *
	 * Each link is sanitized and validated first; links that fail either step are dropped.
	 * The surviving links are bulk-inserted, after which a link status row is created for every
	 * URL that doesn't have one yet and the freshly inserted links are pointed at that row.
	 *
	 * @since 1.0.0
	 *
	 * @param  array $links The links.
	 * @return void
	 */
	private function storeLinks( $links ) {
		$columns     = [
			'post_id',
			'blc_link_status_id',
			'url',
			'url_hash',
			'hostname',
			'hostname_url',
			'external',
			'anchor',
			'phrase',
			'phrase_html',
			'paragraph',
			'paragraph_html',
			'created',
			'updated'
		];
		$currentDate = gmdate( 'Y-m-d H:i:s' );

		$urls = [];
		$rows = [];
		foreach ( $links as $linkData ) {
			$data = Models\Link::sanitizeLink( $linkData );
			if ( empty( $data ) || ! Models\Link::validateLink( $data ) ) {
				continue;
			}

			// Keyed by hash so every distinct URL is only looked up/inserted once below.
			$urls[ $data['url_hash'] ] = $data['url'];

			// NOTE(review): this relies on sanitizeLink() returning its values in the same order as $columns - confirm.
			$rows[] = array_merge( array_values( $data ), [ $currentDate, $currentDate ] );
		}

		// Bail if nothing survived sanitization/validation. This avoids running a bulk insert without rows
		// and a WHERE IN query with an empty list.
		if ( empty( $rows ) ) {
			return;
		}

		aioseoBrokenLinkChecker()->core->db->bulkInsert( 'aioseo_blc_links', $columns, $rows );

		// Find out which of the URLs already have a link status row.
		$existing = aioseoBrokenLinkChecker()->core->db->start( 'aioseo_blc_link_status' )
			->select( 'url_hash' )
			->whereIn( 'url_hash', array_keys( $urls ) )
			->run()
			->result();

		foreach ( $existing as $row ) {
			unset( $urls[ $row->url_hash ] );
		}

		if ( empty( $urls ) ) {
			return;
		}

		// The timestamp is identical for every new status row, so compute it once instead of twice per URL.
		$now = aioseoBrokenLinkChecker()->helpers->timeToMysql( time() );

		foreach ( $urls as $hash => $url ) {
			$statusId = aioseoBrokenLinkChecker()->core->db->insert( 'aioseo_blc_link_status' )
				->set( [
					'url'      => $url,
					'url_hash' => $hash,
					'created'  => $now,
					'updated'  => $now
				] )
				->run()
				->insertId();

			// Don't overwrite the status ID of the links with an empty value if the insert failed.
			if ( empty( $statusId ) ) {
				continue;
			}

			aioseoBrokenLinkChecker()->core->db->update( 'aioseo_blc_links' )
				->where( 'url', $url )
				->set( [
					'blc_link_status_id' => $statusId
				] )
				->run();
		}
	}

	/**
	 * Returns the links that are in the post content.
	 *
	 * Only anchor-tag matches end up in the result: entries without a URL (group 4) or an
	 * anchor (group 6) are skipped, as are links without a resolvable host, links to files with
	 * an ignored extension and links for which no anchor, phrase or phrase HTML can be determined.
	 *
	 * @since 1.0.0
	 *
	 * @param  int    $postId      The post ID.
	 * @param  string $postContent The post content.
	 * @return array               The links.
	 */
	private function extractLinks( $postId, $postContent ) {
		$postContent = aioseoBrokenLinkChecker()->helpers->decodeHtmlEntities( $postContent );

		// Strip data URIs to prevent catastrophic backtracking.
		// preg_replace() returns null on failure, so cast the result to keep working with a string from here on.
		$postContent = (string) preg_replace( '/data:[^;]+;base64,[^"]+/', '', (string) $postContent );

		/**
		 * Regex pattern divided into groups:
		 * 0  - Full phrase with link tag.
		 * 2  - Start of the phrase, before the anchor.
		 * 4  - The URL.
		 * 6  - The anchor.
		 * 9  - The end of the phrase, after the anchor.
		 * 10 - The ending punctuation mark.
		 */
		preg_match_all(
			'/(([^\r\n.?!]*)<t?a[^>]*?href=(\"|\')(?!tel:|mailto:)([^\"\']*?)(\"|\')[^>]*?>([\s\w\W]*?)<\/t?a>|<!-- wp:core-embed\/wordpress {"url":"([^"]*?)"[^}]*?"} -->|(?:>|&nbsp;|\s)((?:(?:http|ftp|https)\:\/\/)(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))(?:<|&nbsp;|\s))([^<>.?!\r\n]*)([.?!]?)/i', // phpcs:disable Generic.Files.LineLength.MaxExceeded
			$postContent,
			$matches
		);

		if ( empty( $matches[0] ) ) {
			return [];
		}

		$links = [];
		foreach ( array_keys( $matches[0] ) as $k ) {
			// Groups 4 and 6 are only filled by the anchor-tag alternative of the pattern, so other matches are skipped here.
			if ( empty( $matches[4][ $k ] ) || empty( $matches[6][ $k ] ) ) {
				continue;
			}

			$parsedUrl = $this->parseUrl( $matches[4][ $k ] );
			if ( empty( $parsedUrl['host'] ) ) {
				continue;
			}

			if ( ! empty( $parsedUrl['path'] ) && $this->hasIgnoredExtension( $parsedUrl['path'] ) ) {
				continue;
			}

			// NOTE: We need to check this here before we strip off the "www" part.
			// Otherwise we will not be able to detect internal links on sites running on "www".
			$isInternal = $parsedUrl['host'] === $this->getHostname();

			// Only strip a leading "www."; an unanchored pattern would also mangle hosts such as "awww.example.com".
			$hostname = aioseoBrokenLinkChecker()->helpers->pregReplace( '/^www\./i', '', $parsedUrl['host'] );
			$anchor   = wp_strip_all_tags( $matches[6][ $k ] );
			// Remove trailing URL tags. The regex isn't sufficient for this.
			$phrase = wp_strip_all_tags( $matches[0][ $k ] );
			// Cast the result since preg_replace() returns null on failure and trim() expects a string.
			$phrase = trim( (string) preg_replace( '/(.*)(<t?a[^<>].*$)/', '', (string) $phrase ) );

			// Don't continue if the anchor or phrase are empty, e.g. blank link tag.
			if ( ! $anchor || ! $phrase ) {
				continue;
			}

			$phraseHtml = aioseoBrokenLinkChecker()->helpers->stripIncompleteHtmlTags( $matches[0][ $k ] );
			$phraseHtml = aioseoBrokenLinkChecker()->helpers->stripScriptTags( $phraseHtml );
			$phraseHtml = aioseoBrokenLinkChecker()->helpers->trimParagraphTags( $phraseHtml );

			if ( empty( $phraseHtml ) ) {
				continue;
			}

			$paragraph     = aioseoBrokenLinkChecker()->main->paragraph->get( $postId, $postContent, $phrase );
			$paragraphHtml = aioseoBrokenLinkChecker()->main->paragraph->getHtml( $anchor, $paragraph, $postContent );

			// Reformat the URL to get rid of params and fragments.
			// NOTE(review): only "fragment" is passed as an exclusion here - confirm whether query params are really dropped.
			$url = aioseoBrokenLinkChecker()->helpers->buildUrl( $parsedUrl, [], [ 'fragment' ] );

			// We need to sanitize the URL here so the hash is calculated based on the escaped version.
			$url = trim( sanitize_url( $url ) );
			$url = apply_filters( 'aioseo_blc_link_url_before_save', $url );

			$links[] = [
				'post_id'            => (int) $postId,
				'blc_link_status_id' => $this->getLinkStatusId( $url ),
				'url'                => $url,
				'url_hash'           => sha1( $url ),
				'hostname'           => $hostname,
				'hostname_url'       => sha1( $hostname ),
				'external'           => ! $isInternal,
				'anchor'             => $anchor,
				'phrase'             => $phrase,
				'phrase_html'        => $phraseHtml,
				'paragraph'          => $paragraph,
				'paragraph_html'     => $paragraphHtml
			];
		}

		return $links;
	}

	/**
	 * Checks whether the file name at the end of the given URL path has an ignored extension.
	 *
	 * Only the last path segment is inspected, so dots in directory names (e.g. "/v1.2/setup.exe")
	 * no longer hide the real extension. Every dot-delimited suffix of the file name is tested
	 * ("tar.gz" and "gz" for "archive.tar.gz") and the comparison is case-insensitive.
	 *
	 * @param  string $path The URL path.
	 * @return bool         Whether the path points to a file with an ignored extension.
	 */
	private function hasIgnoredExtension( $path ) {
		$ignoredExtensions = array_map( 'strtolower', (array) $this->ignoredExtensions );

		$segments = explode( '/', (string) $path );
		$fileName = strtolower( (string) end( $segments ) );

		$offset = strpos( $fileName, '.' );
		while ( false !== $offset ) {
			$suffix = substr( $fileName, $offset + 1 );
			if ( '' !== $suffix && in_array( $suffix, $ignoredExtensions, true ) ) {
				return true;
			}

			$offset = strpos( $fileName, '.', $offset + 1 );
		}

		return false;
	}

	/**
	 * Return the link status ID.
	 *
	 * Looks up the link status row whose URL hash matches the given URL. IDs that were found are
	 * remembered for the remainder of the request; URLs without a status row are looked up again
	 * on every call, since a row may have been created for them in the meantime.
	 *
	 * @since 1.0.0
	 *
	 * @param  string   $url The URL to look up.
	 * @return int|null      The link status ID.
	 */
	private function getLinkStatusId( $url ) {
		static $linkStatusId = [];

		$hash = sha1( $url );
		if ( isset( $linkStatusId[ $hash ] ) ) {
			return $linkStatusId[ $hash ];
		}

		// Only the ID of the first match is used, so there is no need to fetch full rows or more than one of them.
		$possibleLinkStatusId = aioseoBrokenLinkChecker()->core->db->start( 'aioseo_blc_link_status' )
			->select( 'id' )
			->where( 'url_hash', $hash )
			->limit( 1 )
			->run()
			->result();

		if ( empty( $possibleLinkStatusId ) ) {
			// Deliberately not cached: isset() would never report a stored null as a hit anyway.
			return null;
		}

		$linkStatusId[ $hash ] = (int) $possibleLinkStatusId[0]->id;

		return $linkStatusId[ $hash ];
	}

	/**
	 * Returns the site's hostname.
	 *
	 * The host part of the site URL is determined once and then reused on subsequent calls.
	 *
	 * @since 1.0.0
	 *
	 * @return string The hostname.
	 */
	private function getHostname() {
		static $hostname = null;
		if ( null !== $hostname ) {
			return $hostname;
		}

		$hostname = wp_parse_url( get_site_url(), PHP_URL_HOST );

		return $hostname;
	}

	/**
	 * Returns the parsed URL.
	 *
	 * Relative URLs (no scheme and no host) are completed with the scheme and hostname of the site.
	 * URLs that cannot be parsed, or that carry a scheme but no host (e.g. "javascript:void(0)"),
	 * yield an empty array.
	 *
	 * @since 1.0.0
	 * @since 1.1.1 Renamed method.
	 *
	 * @param  string $url The URL.
	 * @return array       The parsed URL.
	 */
	private function parseUrl( $url ) {
		$parsedUrl = wp_parse_url( $url );
		if ( empty( $parsedUrl ) || ! is_array( $parsedUrl ) ) {
			return [];
		}

		if ( ! empty( $parsedUrl['host'] ) ) {
			return $parsedUrl;
		}

		// A scheme without a host (e.g. "javascript:", "sms:", "file:") is not a relative URL.
		// Grafting the site's host onto it would index a bogus internal link.
		if ( ! empty( $parsedUrl['scheme'] ) ) {
			return [];
		}

		// If the URL is relative, add the hostname of the site.
		$parsedUrl['host']   = $this->getHostname();
		$parsedUrl['scheme'] = wp_parse_url( get_site_url(), PHP_URL_SCHEME );

		return $parsedUrl;
	}

	/**
	 * Returns the posts to scan.
	 *
	 * A post qualifies when it has a scannable post type and a public status, and it either has
	 * no record in the BLC posts table, has no link scan date, was modified after its last link
	 * scan or was last scanned before the minimum link scan date.
	 *
	 * @since 1.0.0
	 *
	 * @param  bool      $countOnly Whether to return only the count.
	 * @return array|int            The posts to scan or a count.
	 */
	public function getPostsToScan( $countOnly = false ) {
		$scanLimit      = apply_filters( 'aioseo_blc_links_posts_per_scan', 50 );
		$scannableTypes = aioseoBrokenLinkChecker()->helpers->getScannablePostTypes();
		$publicStatuses = aioseoBrokenLinkChecker()->helpers->getPublicPostStatuses( true );

		// Fall back to the current date/time if no minimum link scan date has been stored.
		$minimumLinkScanDate = aioseoBrokenLinkChecker()->internalOptions->internal->minimumLinkScanDate;
		if ( ! $minimumLinkScanDate ) {
			// phpcs:ignore WordPress.DateTime.RestrictedFunctions.date_date
			$minimumLinkScanDate = date( 'Y-m-d H:i:s' );
		}
		$minimumLinkScanDate = esc_sql( $minimumLinkScanDate );

		$query = aioseoBrokenLinkChecker()->core->db->start( 'posts as p' )
			->leftJoin( 'aioseo_blc_posts as abp', 'p.ID = abp.post_id' )
			->whereIn( 'p.post_status', $publicStatuses )
			->whereIn( 'p.post_type', $scannableTypes )
			->whereRaw( "(
				abp.post_id IS NULL OR
				abp.link_scan_date < p.post_modified_gmt OR
				abp.link_scan_date IS NULL OR
				abp.link_scan_date < '$minimumLinkScanDate'
			)" );

		if ( $countOnly ) {
			return $query->count();
		}

		// Only a limited batch of posts is returned per scan.
		return $query
			->select( 'DISTINCT p.ID, p.post_content, p.post_type, p.post_status' )
			->limit( $scanLimit )
			->run()
			->result();
	}

	/**
	 * Returns the total number of scannable posts.
	 *
	 * Counts all posts that have a scannable post type and a public post status,
	 * regardless of whether they have been scanned already.
	 *
	 * @since 1.0.0
	 *
	 * @return int The total number of scannable posts.
	 */
	private function getTotalScannablePosts() {
		$scannableTypes = aioseoBrokenLinkChecker()->helpers->getScannablePostTypes();
		$publicStatuses = aioseoBrokenLinkChecker()->helpers->getPublicPostStatuses( true );

		return aioseoBrokenLinkChecker()->core->db->start( 'posts as p' )
			->whereIn( 'p.post_status', $publicStatuses )
			->whereIn( 'p.post_type', $scannableTypes )
			->count();
	}

	/**
	 * Returns the scan percentage.
	 *
	 * The percentage is derived from the number of posts that still need to be scanned relative
	 * to the total number of scannable posts, rounded up and limited to the 0-100 range.
	 * If there is nothing (left) to scan, 100 is returned.
	 *
	 * @since 1.0.0
	 *
	 * @return int The scan percentage.
	 */
	public function getScanPercentage() {
		// Cast the counts so the guard below also holds if a count is not returned as an integer.
		$postsToScan         = (int) $this->getPostsToScan( true );
		$totalScannablePosts = (int) $this->getTotalScannablePosts();
		if ( $postsToScan <= 0 || $totalScannablePosts <= 0 ) {
			return 100;
		}

		// ceil() returns a float, so cast it to honour the documented integer return type.
		$percentage = (int) ceil( 100 - ( ( $postsToScan / $totalScannablePosts ) * 100 ) );

		// Keep the result within bounds in case more posts are pending than are counted as scannable.
		return max( 0, min( 100, $percentage ) );
	}

	/**
	 * Sets the ignored extensions.
	 *
	 * The default list can be altered through the "aioseo_blc_ignored_extensions" filter.
	 *
	 * @since 1.0.0
	 *
	 * @return void
	 */
	private function setIgnoredExtensions() {
		// Executable files
		$executableExtensions = [
			'apk',
			'bat',
			'bin',
			'cgi',
			'com',
			'exe',
			'gadget',
			'jar',
			'py',
			'wsf',
		];

		$this->ignoredExtensions = apply_filters( 'aioseo_blc_ignored_extensions', $executableExtensions );
	}
}