From 31e5c6455065f729d4903a4bdffa083778dd0bf6 Mon Sep 17 00:00:00 2001 From: popov654 Date: Sat, 27 Jul 2024 01:02:58 +0300 Subject: [PATCH 1/4] Implemented nth-child pseudoclass support --- composer.json | 11 +++-- src/PHPHtmlParser/Dom/Node/InnerNode.php | 11 +++++ src/PHPHtmlParser/Selector/Parser.php | 29 +++++++++++-- src/PHPHtmlParser/Selector/Seeker.php | 55 ++++++++++++------------ src/PHPHtmlParser/Selector/Selector.php | 2 +- tests/Selector/SeekerTest.php | 40 +++++++++++++++++ 6 files changed, 114 insertions(+), 34 deletions(-) diff --git a/composer.json b/composer.json index 166886f7..024c8e95 100755 --- a/composer.json +++ b/composer.json @@ -24,15 +24,20 @@ "myclabs/php-enum": "^1.7" }, "require-dev": { - "phpunit/phpunit": "^7.5.1", + "phpunit/phpunit": "^10.5", "mockery/mockery": "^1.2", - "infection/infection": "^0.13.4", - "phan/phan": "^2.4", + "infection/infection": ">=0.13.4", + "phan/phan": ">=2.4", "friendsofphp/php-cs-fixer": "^2.16" }, "autoload": { "psr-4": { "PHPHtmlParser\\": "src/PHPHtmlParser" } + }, + "config": { + "allow-plugins": { + "infection/extension-installer": true + } } } diff --git a/src/PHPHtmlParser/Dom/Node/InnerNode.php b/src/PHPHtmlParser/Dom/Node/InnerNode.php index 448057a7..e62e77f6 100644 --- a/src/PHPHtmlParser/Dom/Node/InnerNode.php +++ b/src/PHPHtmlParser/Dom/Node/InnerNode.php @@ -100,6 +100,17 @@ public function countChildren(): int return \count($this->children); } + public function childNodes(): array + { + return $this->children; + } + + public function childElements(): array + { + return array_values(array_filter($this->getChildren(), function ($el) { + return !$el->isTextNode(); + })); + } /** * Adds a child node to this node and returns the id of the child for this * parent. 
diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index 4643c467..411af941 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -19,7 +19,7 @@ class Parser implements ParserInterface * * @var string */ - private $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + private $pattern = "/([\w:*>+~-]*(?:\([\w\d]+\))?)(?:#([\w-]+)|\.([\w\.-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; /** * Parses the selector string. @@ -58,11 +58,34 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD $value = \explode('.', $match[3]); } + // check for pseudoclass selector + if (strpos($match[1], ':') !== false) { + $pos = strpos($match[0], ':'); + $key = 'pseudoclass'; + $tag = $pos > 0 ? substr($match[0], 0, $pos) : '*'; + $value = \substr($match[0], $pos+1); + + if (\trim($value, ', ') == 'first-child') { + $value = 'nth-child(1)'; + } + else if (\trim($value, ', ') == 'last-child') { + $value = 'nth-last-child(1)'; + } + + if (preg_match("/^nth-child\(\d+\)$/", \trim($value, ', '))) { + preg_match_all("/^nth-child\((\d+)\)$/", \trim($value, ', '), $matches, PREG_SET_ORDER); + $key = (int) $matches[0][1]; + } else if (preg_match("/^nth-last-child\(\d+\)$/", \trim($value, ', '))) { + preg_match_all("/^nth-last-child\((\d+)\)$/", \trim($value, ', '), $matches, PREG_SET_ORDER); + $key = - (int) $matches[0][1]; + } + } + // and final attribute selector - if (!empty($match[4])) { + else if (!empty($match[4])) { $key = \strtolower($match[4]); } if (!empty($match[5])) { $operator = $match[5]; } if (!empty($match[6])) { diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index abd6dc4e..b91d3961 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -6,6 +6,7 
@@ use PHPHtmlParser\Contracts\Selector\SeekerInterface; use PHPHtmlParser\Dom\Node\AbstractNode; +use PHPHtmlParser\Dom\Node\HtmlNode; use PHPHtmlParser\Dom\Node\InnerNode; use PHPHtmlParser\Dom\Node\LeafNode; use PHPHtmlParser\DTO\Selector\RuleDTO; @@ -23,24 +24,6 @@ class Seeker implements SeekerInterface */ public function seek(array $nodes, RuleDTO $rule, array $options): array { - // XPath index - if ($rule->getTag() !== null && \is_numeric($rule->getKey())) { - $count = 0; - foreach ($nodes as $node) { - if ($rule->getTag() == '*' - || $rule->getTag() == $node->getTag() - ->name() - ) { - ++$count; - if ($count == $rule->getKey()) { - // found the node we wanted - return [$node]; - } - } - } - - return []; - } $options = $this->flattenOptions($options); @@ -62,16 +45,34 @@ public function seek(array $nodes, RuleDTO $rule, array $options): array continue; } - $pass = $this->checkTag($rule, $child); - if ($pass && $rule->getKey() !== null) { - $pass = $this->checkKey($rule, $child); + if (!$child instanceof HtmlNode) { + $child = $this->getNextChild($node, $child); + continue; } - if ($pass && - $rule->getKey() !== null && - $rule->getValue() !== null && - $rule->getValue() != '*' - ) { - $pass = $this->checkComparison($rule, $child); + + $pass = true; + + if ($rule->getTag() !== null && \is_numeric($rule->getKey()) && $node instanceof HtmlNode) { + if (strpos($rule->getValue(), 'nth-') === 0) { + $children = $node->childElements(); + $n = $rule->getKey() < 0 ? 
count($children) + $rule->getKey() : $rule->getKey()-1; + $pass = $n >= 0 && $n < count($children) && $child == $children[$n]; + } + } + + if ($pass) { + $pass = $this->checkTag($rule, $child); + if ($pass && $rule->getKey() !== null && !\is_numeric($rule->getKey())) { + $pass = $this->checkKey($rule, $child); + } + if ($pass && + $rule->getKey() !== null && + $rule->getValue() !== null && + $rule->getValue() != '*' && + !\is_numeric($rule->getKey()) + ) { + $pass = $this->checkComparison($rule, $child); + } } if ($pass) { diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index 697fb9cd..53c26e05 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -71,7 +71,7 @@ public function find(AbstractNode $node): Collection $options = []; foreach ($selector->getRules() as $rule) { - if ($rule->isAlterNext()) { + if ($rule->isAlterNext() && $rule->getTag() == '>') { $options[] = $this->alterNext($rule); continue; } diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index d9e0e824..18e7c7ab 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -2,6 +2,7 @@ declare(strict_types=1); +use PHPHtmlParser\Dom\Node\HtmlNode; use PHPHtmlParser\DTO\Selector\RuleDTO; use PHPHtmlParser\Selector\Seeker; use PHPUnit\Framework\TestCase; @@ -22,4 +23,43 @@ public function testSeekReturnEmptyArray() $results = $seeker->seek([], $ruleDTO, []); $this->assertCount(0, $results); } + + public function testSeekNthChild() + { + $ruleDTO = RuleDTO::makeFromPrimitives( + '*', + '=', + 1, + null, + false, + false + ); + + $test = new HtmlNode('div'); + $p1 = new HtmlNode('p'); + $div = new HtmlNode('div'); + $p2 = new HtmlNode('p'); + $test->addChild($p1); + $test->addChild($div); + $test->addChild($p2); + + $seeker = new Seeker(); + + $results = $seeker->seek([$test], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('p', 
$results[0]->getTag()->name()); + + $ruleDTO = RuleDTO::makeFromPrimitives( + '*', + '=', + -1, + null, + false, + false + ); + + $results = $seeker->seek([$test], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('p', $results[0]->getTag()->name()); + } } From 3f2a048cc22bad8f144fac76c79606c357bd6a98 Mon Sep 17 00:00:00 2001 From: popov654 Date: Sat, 27 Jul 2024 01:03:45 +0300 Subject: [PATCH 2/4] Implemented nth-of-type pseudoclass support --- src/PHPHtmlParser/DTO/Selector/RuleDTO.php | 26 +++++++--- src/PHPHtmlParser/Dom/Node/InnerNode.php | 8 +++ src/PHPHtmlParser/Selector/Parser.php | 23 ++++++--- src/PHPHtmlParser/Selector/Seeker.php | 10 ++-- tests/Selector/SeekerTest.php | 57 ++++++++++++++++++++++ 5 files changed, 105 insertions(+), 19 deletions(-) diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php index 5299e3a0..6a3b0b3c 100644 --- a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php @@ -36,6 +36,11 @@ final class RuleDTO */ private $alterNext; + /** + * @var bool + */ + private $isNthOfType; + private function __construct(array $values) { $this->tag = $values['tag']; @@ -44,21 +49,23 @@ private function __construct(array $values) $this->value = $values['value']; $this->noKey = $values['noKey']; $this->alterNext = $values['alterNext']; + $this->isNthOfType = $values['isNthOfType']; } /** * @param string|array|null $key * @param string|array|null $value */ - public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext): RuleDTO + public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext, bool $isNthOfType = false): RuleDTO { return new RuleDTO([ - 'tag' => $tag, - 'operator' => $operator, - 'key' => $key, - 'value' => $value, - 'noKey' => $noKey, - 'alterNext' => $alterNext, + 'tag' => $tag, + 'operator' => $operator, + 'key' => 
$key, + 'value' => $value, + 'noKey' => $noKey, + 'alterNext' => $alterNext, + 'isNthOfType' => $isNthOfType ]); } @@ -97,4 +104,9 @@ public function isAlterNext(): bool { return $this->alterNext; } + + public function isNthOfType(): bool + { + return $this->isNthOfType; + } } diff --git a/src/PHPHtmlParser/Dom/Node/InnerNode.php b/src/PHPHtmlParser/Dom/Node/InnerNode.php index e62e77f6..2b9fb305 100644 --- a/src/PHPHtmlParser/Dom/Node/InnerNode.php +++ b/src/PHPHtmlParser/Dom/Node/InnerNode.php @@ -111,6 +111,14 @@ public function childElements(): array return !$el->isTextNode(); })); } + + public function childElementsOfType(string $tag): array + { + return array_values(array_filter($this->getChildren(), function ($el) use ($tag) { + return $el instanceof HtmlNode && $el->getTag()->name() == $tag; + })); + } + /** * Adds a child node to this node and returns the id of the child for this * parent. diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index 411af941..23c16d31 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -40,6 +40,7 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD $value = null; $noKey = false; $alterNext = false; + $isNthOfType = false; // check for elements that alter the behavior of the next element if ($tag == '>') { @@ -71,14 +72,21 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD else if (\trim($value, ', ') == 'last-child') { $value = 'nth-last-child(1)'; } + else if (\trim($value, ', ') == 'first-of-type') { + $value = 'nth-of-type(1)'; + } + else if (\trim($value, ', ') == 'last-of-type') { + $value = 'nth-last-of-type(1)'; + } - if (preg_match("/^nth-child\(\d+\)$/", \trim($value, ', '))) { - preg_match_all("/^nth-child\((\d+)\)$/", \trim($value, ', '), $matches, PREG_SET_ORDER); - $key = (int) $matches[0][1]; - } else if (preg_match("/^nth-last-child\(\d+\)$/", \trim($value, ', 
'))) { - preg_match_all("/^nth-last-child\((\d+)\)$/", \trim($value, ', '), $matches, PREG_SET_ORDER); - $key = - (int) $matches[0][1]; + if (preg_match("/^(nth-child|nth-of-type)\(\d+\)$/", \trim($value, ', '))) { + preg_match_all("/^(nth-child|nth-of-type)\((\d+)\)$/", \trim($value, ', '), $matches, PREG_SET_ORDER); + $key = (int) $matches[0][2]; + } else if (preg_match("/^(nth-last-child|nth-last-of-type)\(\d+\)$/", \trim($value, ', '))) { + preg_match_all("/^(nth-last-child|nth-last-of-type)\((\d+)\)$/", \trim($value, ', '), $matches, PREG_SET_ORDER); + $key = - (int) $matches[0][2]; } + $isNthOfType = preg_match("/^nth(-last)?-of-type\(\d+\)$/", \trim($value, ', ')) === 1; } // and final attribute selector @@ -121,7 +129,8 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD $key, $value, $noKey, - $alterNext + $alterNext, + $isNthOfType ); if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') { $selectors[] = ParsedSelectorDTO::makeFromRules($rules); diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index b91d3961..7425a8e5 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -53,11 +53,11 @@ public function seek(array $nodes, RuleDTO $rule, array $options): array $pass = true; if ($rule->getTag() !== null && \is_numeric($rule->getKey()) && $node instanceof HtmlNode) { - if (strpos($rule->getValue(), 'nth-') === 0) { - $children = $node->childElements(); - $n = $rule->getKey() < 0 ? count($children) + $rule->getKey() : $rule->getKey()-1; - $pass = $n >= 0 && $n < count($children) && $child == $children[$n]; - } + $children = $rule->isNthOfType() ? + $node->childElementsOfType($child->getTag()->name()) : + $node->childElements(); + $n = $rule->getKey() < 0 ? 
count($children) + $rule->getKey() : $rule->getKey()-1; + $pass = $n >= 0 && $n < count($children) && $child === $children[$n]; } if ($pass) { diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index 18e7c7ab..84a8be86 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -62,4 +62,61 @@ public function testSeekNthChild() $this->assertCount(1, $results); $this->assertEquals('p', $results[0]->getTag()->name()); } + + public function testSeekNthOfType() + { + $ruleDTO = RuleDTO::makeFromPrimitives( + 'div', + '=', + 1, + null, + false, + false, + true + ); + + $test = new HtmlNode('div'); + $p1 = new HtmlNode('p'); + $div = new HtmlNode('div'); + $p2 = new HtmlNode('p'); + $test->addChild($p1); + $test->addChild($div); + $test->addChild($p2); + + $seeker = new Seeker(); + + $results = $seeker->seek([$test], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('div', $results[0]->getTag()->name()); + + $ruleDTO = RuleDTO::makeFromPrimitives( + 'p', + '=', + 2, + null, + false, + false, + true + ); + + $results = $seeker->seek([$test], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('p', $results[0]->getTag()->name()); + $this->assertTrue($results[0] === $test->lastChild()); + + $ruleDTO = RuleDTO::makeFromPrimitives( + 'p', + '=', + -1, + null, + false, + false, + true + ); + + $results = $seeker->seek([$test], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('p', $results[0]->getTag()->name()); + $this->assertTrue($results[0] === $test->lastChild()); + } } From 04bf123976a63ed3b8e203b36a915e44bf9d8e1d Mon Sep 17 00:00:00 2001 From: popov654 Date: Sat, 20 Jul 2024 21:13:06 +0300 Subject: [PATCH 3/4] Implemented + and ~ operators support --- src/PHPHtmlParser/Selector/Parser.php | 2 +- src/PHPHtmlParser/Selector/Seeker.php | 16 +++++ src/PHPHtmlParser/Selector/Selector.php | 5 ++ tests/Selector/SeekerTest.php | 78 +++++++++++++++++++++++++ 4 files 
changed, 100 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index 23c16d31..3f1be13b 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -43,7 +43,7 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD $isNthOfType = false; // check for elements that alter the behavior of the next element - if ($tag == '>') { + if ($tag == '>' || $tag == '+' || $tag == '~') { $alterNext = true; } diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index 7425a8e5..c7bbf6b3 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -25,6 +25,22 @@ class Seeker implements SeekerInterface public function seek(array $nodes, RuleDTO $rule, array $options): array { + if ($rule->getTag() == '+' || $rule->getTag() == '~') { + $result = []; + foreach ($nodes as $node) { + if ($rule->getTag() == '+') { + if ($node->hasNextSibling()) { $result[] = $node->nextSibling(); } + } else { + while ($node->hasNextSibling()) { + $result[] = $node->nextSibling(); + $node = $node->nextSibling(); + } + } + } + + return $result; + } + $options = $this->flattenOptions($options); $return = []; diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index 53c26e05..12150e8b 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -70,7 +70,11 @@ public function find(AbstractNode $node): Collection } $options = []; + $lastRule = null; foreach ($selector->getRules() as $rule) { + if ($rule->getTag() == '*' && $lastRule && ($lastRule->getTag() == '+' || $lastRule->getTag() == '~')) { + continue; + } if ($rule->isAlterNext() && $rule->getTag() == '>') { $options[] = $this->alterNext($rule); continue; @@ -78,6 +82,7 @@ public function find(AbstractNode $node): Collection $nodes = 
$this->seeker->seek($nodes, $rule, $options); // clear the 
options $options = []; + $lastRule = $rule; } // this is the final set of nodes diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index 84a8be86..e4263bb0 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -119,4 +119,82 @@ public function testSeekNthOfType() $this->assertEquals('p', $results[0]->getTag()->name()); $this->assertTrue($results[0] === $test->lastChild()); } + + public function testSeekNextOne() + { + $ruleDTO = RuleDTO::makeFromPrimitives( + '+', + '=', + null, + null, + false, + false + ); + + $test = new HtmlNode('div'); + $p1 = new HtmlNode('p'); + $div = new HtmlNode('div'); + $p2 = new HtmlNode('p'); + $test->addChild($p1); + $test->addChild($div); + $test->addChild($p2); + + $seeker = new Seeker(); + + $results = $seeker->seek([$p1], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('div', $results[0]->getTag()->name()); + + $ruleDTO = RuleDTO::makeFromPrimitives( + '+', + '=', + null, + null, + false, + false + ); + + $results = $seeker->seek([$div], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('p', $results[0]->getTag()->name()); + } + + public function testSeekNextAll() + { + $ruleDTO = RuleDTO::makeFromPrimitives( + '~', + '=', + null, + null, + false, + false + ); + + $test = new HtmlNode('div'); + $p1 = new HtmlNode('p'); + $div = new HtmlNode('div'); + $p2 = new HtmlNode('p'); + $test->addChild($p1); + $test->addChild($div); + $test->addChild($p2); + + $seeker = new Seeker(); + + $results = $seeker->seek([$p1], $ruleDTO, []); + $this->assertCount(2, $results); + $this->assertEquals('p', $results[1]->getTag()->name()); + + $ruleDTO = RuleDTO::makeFromPrimitives( + '~', + '=', + null, + null, + false, + false + ); + + $results = $seeker->seek([$div], $ruleDTO, []); + $this->assertCount(1, $results); + $this->assertEquals('p', $results[0]->getTag()->name()); + } } From 5752ee9005e66df268f491910d7612285dbf62b3 Mon Sep 17 00:00:00 2001 
From: popov654 Date: Sat, 20 Jul 2024 21:31:09 +0300 Subject: [PATCH 4/4] Code cleanup --- src/PHPHtmlParser/Selector/Seeker.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index c7bbf6b3..3be94cc0 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -20,7 +20,7 @@ class Seeker implements SeekerInterface * * @var InnerNode[] * - * @throws ChildNotFoundException + * @return AbstractNode[] */ public function seek(array $nodes, RuleDTO $rule, array $options): array {