Plugin to allow visitor contributions to WordPress posts, wiki style.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

491 lines
11KB

  1. <?php
  2. namespace Caxy\HtmlDiff;
  3. /**
  4. * Class AbstractDiff
  5. * @package Caxy\HtmlDiff
  6. */
  7. abstract class AbstractDiff
  8. {
  9. /**
  10. * @var array
  11. *
  12. * @deprecated since 0.1.0
  13. */
  14. public static $defaultSpecialCaseTags = array('strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p');
  15. /**
  16. * @var array
  17. *
  18. * @deprecated since 0.1.0
  19. */
  20. public static $defaultSpecialCaseChars = array('.', ',', '(', ')', '\'');
  21. /**
  22. * @var bool
  23. *
  24. * @deprecated since 0.1.0
  25. */
  26. public static $defaultGroupDiffs = true;
  27. /**
  28. * @var HtmlDiffConfig
  29. */
  30. protected $config;
  31. /**
  32. * @var string
  33. */
  34. protected $content;
  35. /**
  36. * @var string
  37. */
  38. protected $oldText;
  39. /**
  40. * @var string
  41. */
  42. protected $newText;
  43. /**
  44. * @var array
  45. */
  46. protected $oldWords = array();
  47. /**
  48. * @var array
  49. */
  50. protected $newWords = array();
  51. /**
  52. * @var DiffCache[]
  53. */
  54. private $diffCaches = array();
  55. /**
  56. * AbstractDiff constructor.
  57. *
  58. * @param string $oldText
  59. * @param string $newText
  60. * @param string $encoding
  61. * @param null|array $specialCaseTags
  62. * @param null|bool $groupDiffs
  63. */
  64. public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
  65. {
  66. mb_substitute_character(0x20);
  67. $this->config = HtmlDiffConfig::create()->setEncoding($encoding);
  68. if ($specialCaseTags !== null) {
  69. $this->config->setSpecialCaseTags($specialCaseTags);
  70. }
  71. if ($groupDiffs !== null) {
  72. $this->config->setGroupDiffs($groupDiffs);
  73. }
  74. $this->oldText = $this->purifyHtml(trim($oldText));
  75. $this->newText = $this->purifyHtml(trim($newText));
  76. $this->content = '';
  77. }
  78. /**
  79. * @return bool|string
  80. */
  81. abstract public function build();
  82. /**
  83. * @return DiffCache|null
  84. */
  85. protected function getDiffCache()
  86. {
  87. if (!$this->hasDiffCache()) {
  88. return null;
  89. }
  90. $hash = spl_object_hash($this->getConfig()->getCacheProvider());
  91. if (!array_key_exists($hash, $this->diffCaches)) {
  92. $this->diffCaches[$hash] = new DiffCache($this->getConfig()->getCacheProvider());
  93. }
  94. return $this->diffCaches[$hash];
  95. }
  96. /**
  97. * @return bool
  98. */
  99. protected function hasDiffCache()
  100. {
  101. return null !== $this->getConfig()->getCacheProvider();
  102. }
  103. /**
  104. * @return HtmlDiffConfig
  105. */
  106. public function getConfig()
  107. {
  108. return $this->config;
  109. }
  110. /**
  111. * @param HtmlDiffConfig $config
  112. *
  113. * @return AbstractDiff
  114. */
  115. public function setConfig(HtmlDiffConfig $config)
  116. {
  117. $this->config = $config;
  118. return $this;
  119. }
  120. /**
  121. * @return int
  122. *
  123. * @deprecated since 0.1.0
  124. */
  125. public function getMatchThreshold()
  126. {
  127. return $this->config->getMatchThreshold();
  128. }
  129. /**
  130. * @param int $matchThreshold
  131. *
  132. * @return AbstractDiff
  133. *
  134. * @deprecated since 0.1.0
  135. */
  136. public function setMatchThreshold($matchThreshold)
  137. {
  138. $this->config->setMatchThreshold($matchThreshold);
  139. return $this;
  140. }
  141. /**
  142. * @param array $chars
  143. *
  144. * @deprecated since 0.1.0
  145. */
  146. public function setSpecialCaseChars(array $chars)
  147. {
  148. $this->config->setSpecialCaseChars($chars);
  149. }
  150. /**
  151. * @return array|null
  152. *
  153. * @deprecated since 0.1.0
  154. */
  155. public function getSpecialCaseChars()
  156. {
  157. return $this->config->getSpecialCaseChars();
  158. }
  159. /**
  160. * @param string $char
  161. *
  162. * @deprecated since 0.1.0
  163. */
  164. public function addSpecialCaseChar($char)
  165. {
  166. $this->config->addSpecialCaseChar($char);
  167. }
  168. /**
  169. * @param string $char
  170. *
  171. * @deprecated since 0.1.0
  172. */
  173. public function removeSpecialCaseChar($char)
  174. {
  175. $this->config->removeSpecialCaseChar($char);
  176. }
  177. /**
  178. * @param array $tags
  179. *
  180. * @deprecated since 0.1.0
  181. */
  182. public function setSpecialCaseTags(array $tags = array())
  183. {
  184. $this->config->setSpecialCaseChars($tags);
  185. }
  186. /**
  187. * @param string $tag
  188. *
  189. * @deprecated since 0.1.0
  190. */
  191. public function addSpecialCaseTag($tag)
  192. {
  193. $this->config->addSpecialCaseTag($tag);
  194. }
  195. /**
  196. * @param string $tag
  197. *
  198. * @deprecated since 0.1.0
  199. */
  200. public function removeSpecialCaseTag($tag)
  201. {
  202. $this->config->removeSpecialCaseTag($tag);
  203. }
  204. /**
  205. * @return array|null
  206. *
  207. * @deprecated since 0.1.0
  208. */
  209. public function getSpecialCaseTags()
  210. {
  211. return $this->config->getSpecialCaseTags();
  212. }
  213. /**
  214. * @return string
  215. */
  216. public function getOldHtml()
  217. {
  218. return $this->oldText;
  219. }
  220. /**
  221. * @return string
  222. */
  223. public function getNewHtml()
  224. {
  225. return $this->newText;
  226. }
  227. /**
  228. * @return string
  229. */
  230. public function getDifference()
  231. {
  232. return $this->content;
  233. }
  234. /**
  235. * @param bool $boolean
  236. *
  237. * @return $this
  238. *
  239. * @deprecated since 0.1.0
  240. */
  241. public function setGroupDiffs($boolean)
  242. {
  243. $this->config->setGroupDiffs($boolean);
  244. return $this;
  245. }
  246. /**
  247. * @return bool
  248. *
  249. * @deprecated since 0.1.0
  250. */
  251. public function isGroupDiffs()
  252. {
  253. return $this->config->isGroupDiffs();
  254. }
  255. /**
  256. * @param string $tag
  257. *
  258. * @return string
  259. */
  260. protected function getOpeningTag($tag)
  261. {
  262. return "/<".$tag."[^>]*/i";
  263. }
  264. /**
  265. * @param string $tag
  266. *
  267. * @return string
  268. */
  269. protected function getClosingTag($tag)
  270. {
  271. return "</".$tag.">";
  272. }
  273. /**
  274. * @param string $str
  275. * @param string $start
  276. * @param string $end
  277. *
  278. * @return string
  279. */
  280. protected function getStringBetween($str, $start, $end)
  281. {
  282. $expStr = explode( $start, $str, 2 );
  283. if ( count( $expStr ) > 1 ) {
  284. $expStr = explode( $end, $expStr[ 1 ] );
  285. if ( count( $expStr ) > 1 ) {
  286. array_pop( $expStr );
  287. return implode( $end, $expStr );
  288. }
  289. }
  290. return '';
  291. }
  292. /**
  293. * @param string $html
  294. *
  295. * @return string
  296. */
  297. protected function purifyHtml($html)
  298. {
  299. if ( class_exists( 'Tidy' ) && false ) {
  300. $config = array( 'output-xhtml' => true, 'indent' => false );
  301. $tidy = new tidy();
  302. $tidy->parseString( $html, $config, 'utf8' );
  303. $html = (string) $tidy;
  304. return $this->getStringBetween( $html, '<body>' );
  305. }
  306. return $html;
  307. }
  308. protected function splitInputsToWords()
  309. {
  310. $this->oldWords = $this->convertHtmlToListOfWords( $this->explode( $this->oldText ) );
  311. $this->newWords = $this->convertHtmlToListOfWords( $this->explode( $this->newText ) );
  312. }
  313. /**
  314. * @param string $text
  315. *
  316. * @return bool
  317. */
  318. protected function isPartOfWord($text)
  319. {
  320. return ctype_alnum(str_replace($this->config->getSpecialCaseChars(), '', $text));
  321. }
  322. /**
  323. * @param array $characterString
  324. *
  325. * @return array
  326. */
  327. protected function convertHtmlToListOfWords($characterString)
  328. {
  329. $mode = 'character';
  330. $current_word = '';
  331. $words = array();
  332. foreach ($characterString as $i => $character) {
  333. switch ($mode) {
  334. case 'character':
  335. if ( $this->isStartOfTag( $character ) ) {
  336. if ($current_word != '') {
  337. $words[] = $current_word;
  338. }
  339. $current_word = "<";
  340. $mode = 'tag';
  341. } elseif (preg_match("/\s/", $character)) {
  342. if ($current_word !== '') {
  343. $words[] = $current_word;
  344. }
  345. $current_word = preg_replace('/\s+/S', ' ', $character);
  346. $mode = 'whitespace';
  347. } else {
  348. if (
  349. (ctype_alnum($character) && (strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
  350. (in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i+1]) && $this->isPartOfWord($characterString[$i+1]))
  351. ) {
  352. $current_word .= $character;
  353. } else {
  354. $words[] = $current_word;
  355. $current_word = $character;
  356. }
  357. }
  358. break;
  359. case 'tag' :
  360. if ( $this->isEndOfTag( $character ) ) {
  361. $current_word .= ">";
  362. $words[] = $current_word;
  363. $current_word = "";
  364. if ( !preg_match('[^\s]', $character ) ) {
  365. $mode = 'whitespace';
  366. } else {
  367. $mode = 'character';
  368. }
  369. } else {
  370. $current_word .= $character;
  371. }
  372. break;
  373. case 'whitespace':
  374. if ( $this->isStartOfTag( $character ) ) {
  375. if ($current_word !== '') {
  376. $words[] = $current_word;
  377. }
  378. $current_word = "<";
  379. $mode = 'tag';
  380. } elseif ( preg_match( "/\s/", $character ) ) {
  381. $current_word .= $character;
  382. $current_word = preg_replace('/\s+/S', ' ', $current_word);
  383. } else {
  384. if ($current_word != '') {
  385. $words[] = $current_word;
  386. }
  387. $current_word = $character;
  388. $mode = 'character';
  389. }
  390. break;
  391. default:
  392. break;
  393. }
  394. }
  395. if ($current_word != '') {
  396. $words[] = $current_word;
  397. }
  398. return $words;
  399. }
  400. /**
  401. * @param string $val
  402. *
  403. * @return bool
  404. */
  405. protected function isStartOfTag($val)
  406. {
  407. return $val == "<";
  408. }
  409. /**
  410. * @param string $val
  411. *
  412. * @return bool
  413. */
  414. protected function isEndOfTag($val)
  415. {
  416. return $val == ">";
  417. }
  418. /**
  419. * @param string $value
  420. *
  421. * @return bool
  422. */
  423. protected function isWhiteSpace($value)
  424. {
  425. return !preg_match( '[^\s]', $value );
  426. }
  427. /**
  428. * @param string $value
  429. *
  430. * @return array
  431. */
  432. protected function explode($value)
  433. {
  434. // as suggested by @onassar
  435. return preg_split( '//u', $value );
  436. }
  437. }