Backported from 5.5 for 5.4 by Remi Collet


From 0da8b8b801f9276359262f1ef8274c7812d3dfda Mon Sep 17 00:00:00 2001
From: Stanislav Malyshev <stas@php.net>
Date: Sun, 15 May 2016 23:26:51 -0700
Subject: [PATCH] Fix bug #72135 - don't create strings with lengths outside
 int range

---
 ext/standard/html.c | 50 +++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/ext/standard/html.c b/ext/standard/html.c
index 72423b5..81d8aff 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -163,7 +163,7 @@ static inline unsigned int get_next_char(
 					else
 						MB_FAILURE(pos, 4);
 				}
-				
+
 				this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
 				if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
 					MB_FAILURE(pos, 4);
@@ -437,7 +437,7 @@ static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
 
 	if (charset_hint) {
 		int found = 0;
-		
+
 		/* now walk the charset map and look for the codeset */
 		for (i = 0; charset_map[i].codeset; i++) {
 			if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
@@ -545,7 +545,7 @@ static inline unsigned char unimap_bsearch(const uni_to_enc *table, unsigned cod
 		return 0;
 
 	code_key = (unsigned short) code_key_a;
-	
+
 	while (l <= h) {
 		m = l + (h - l) / 2;
 		if (code_key < m->un_code_point)
@@ -571,7 +571,7 @@ static inline int map_from_unicode(unsigned code, enum entity_charset charset, u
 		/* identity mapping of code points to unicode */
 		if (code > 0xFF) {
 			return FAILURE;
-		} 
+		}
 		*res = code;
 		break;
 
@@ -590,7 +590,7 @@ static inline int map_from_unicode(unsigned code, enum entity_charset charset, u
 			return FAILURE;
 		}
 		break;
-		
+
 	case cs_8859_15:
 		if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) {
 			*res = code;
@@ -634,7 +634,7 @@ static inline int map_from_unicode(unsigned code, enum entity_charset charset, u
 	case cs_cp866:
 		table = unimap_cp866;
 		table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866);
-		
+
 table_over_7F:
 		if (code <= 0x7F) {
 			*res = code;
@@ -710,7 +710,7 @@ static inline int unicode_cp_is_allowed(unsigned uni_cp, int document_type)
 	 * Not sure this is the relevant part for HTML 5, though. I opted to
 	 * disallow the characters that would result in a parse error when
 	 * preprocessing of the input stream. See also section 8.1.3.
-	 * 
+	 *
 	 * It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to
 	 * XHTML 1.0 the same rules as for XML 1.0.
 	 * See <http://cmsmcq.com/2007/C1.xml>.
@@ -774,7 +774,7 @@ static inline int numeric_entity_is_allowed(unsigned uni_cp, int document_type)
 /* {{{ process_numeric_entity
  * Auxiliary function to traverse_for_entities.
  * On input, *buf should point to the first character after # and on output, it's the last
- * byte read, no matter if there was success or insuccess. 
+ * byte read, no matter if there was success or insuccess.
  */
 static inline int process_numeric_entity(const char **buf, unsigned *code_point)
 {
@@ -784,7 +784,7 @@ static inline int process_numeric_entity(const char **buf, unsigned *code_point)
 
 	if (hexadecimal && (**buf != '\0'))
 		(*buf)++;
-			
+
 	/* strtol allows whitespace and other stuff in the beginning
 		* we're not interested */
 	if ((hexadecimal && !isxdigit(**buf)) ||
@@ -969,7 +969,7 @@ static void traverse_for_entities(
 				goto invalid_code;
 
 			/* are we allowed to decode this entity in this document type?
-			 * HTML 5 is the only that has a character that cannot be used in 
+			 * HTML 5 is the only that has a character that cannot be used in
 			 * a numeric entity but is allowed literally (U+000D). The
 			 * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
 			if (!unicode_cp_is_allowed(code, doctype) ||
@@ -996,9 +996,9 @@ static void traverse_for_entities(
 				}
 			}
 		}
-		
+
 		assert(*next == ';');
-		
+
 		if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
 				(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
 				/* && code2 == '\0' always true for current maps */)
@@ -1026,7 +1026,7 @@ static void traverse_for_entities(
 			*(q++) = *p;
 		}
 	}
-	
+
 	*q = '\0';
 	*retlen = (size_t)(q - ret);
 }
@@ -1066,7 +1066,7 @@ static entity_table_opt determine_entity_table(int all, int doctype)
 	entity_table_opt retval = {NULL};
 
 	assert(!(doctype == ENT_HTML_DOC_XML1 && all));
-	
+
 	if (all) {
 		retval.ms_table = (doctype == ENT_HTML_DOC_HTML5) ?
 			entity_ms_table_html5 : entity_ms_table_html4;
@@ -1111,13 +1111,13 @@ PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_
 	if (retlen == 0) {
 		goto empty_source;
 	}
-	
+
 	inverse_map = unescape_inverse_map(all, flags);
-	
+
 	/* replace numeric entities */
 	traverse_for_entities(old, oldlen, ret, &retlen, all, flags, inverse_map, charset);
 
-empty_source:	
+empty_source:
 	*newlen = retlen;
 	return ret;
 }
@@ -1141,7 +1141,7 @@ static inline void find_entity_for_char(
 {
 	unsigned stage1_idx = ENT_STAGE1_INDEX(k);
 	const entity_stage3_row *c;
-	
+
 	if (stage1_idx > 0x1D) {
 		*entity     = NULL;
 		*entity_len = 0;
@@ -1162,7 +1162,7 @@ static inline void find_entity_for_char(
 		if (!(*cursor < oldlen))
 			goto no_suitable_2nd;
 
-		next_char = get_next_char(charset, old, oldlen, cursor, &status); 
+		next_char = get_next_char(charset, old, oldlen, cursor, &status);
 
 		if (status == FAILURE)
 			goto no_suitable_2nd;
@@ -1187,7 +1187,7 @@ static inline void find_entity_for_char(
 		*entity = (const unsigned char *)
 			c->data.multicodepoint_table[0].leading_entry.default_entity;
 		*entity_len = c->data.multicodepoint_table[0].leading_entry.default_entity_len;
-	}	
+	}
 }
 /* }}} */
 
@@ -1255,7 +1255,7 @@ PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size
 
 	/* initial estimate */
 	if (oldlen < 64) {
-		maxlen = 128;	
+		maxlen = 128;
 	} else {
 		maxlen = 2 * oldlen;
 		if (maxlen < oldlen) {
@@ -1444,6 +1444,10 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
 	}
 
 	replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC);
+	if (new_len > INT_MAX) {
+		efree(replaced);
+		RETURN_FALSE;
+	}
 	RETVAL_STRINGL(replaced, (int)new_len, 0);
 }
 /* }}} */
@@ -1577,7 +1581,7 @@ static inline void write_s3row_data(
 			} else {
 				spe_cp = uni_cp;
 			}
-			
+
 			written_k2 = write_octet_sequence(&key[written_k1], charset, spe_cp);
 			memcpy(&entity[1], mcpr[i].normal_entry.entity, l);
 			entity[l + 1] = ';';
@@ -1615,7 +1619,7 @@ PHP_FUNCTION(get_html_translation_table)
 	LIMIT_ALL(all, doctype, charset);
 
 	array_init(return_value);
-	
+
 	entity_table = determine_entity_table(all, doctype);
 	if (all && !CHARSET_UNICODE_COMPAT(charset)) {
 		to_uni_table = enc_to_uni_index[charset];


From e9559131152ab0fa89737db11ebe8f43e1435b96 Mon Sep 17 00:00:00 2001
From: Stanislav Malyshev <stas@php.net>
Date: Tue, 24 May 2016 15:52:15 -0700
Subject: [PATCH] Better fix for bug #72135

---
 ext/standard/html.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ext/standard/html.c b/ext/standard/html.c
index 81d8aff..c5fd4b8 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -1423,6 +1423,11 @@ PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size
 	}
 	replaced[len] = '\0';
 	*newlen = len;
+	if(len > INT_MAX) {
+		zend_error_noreturn(E_ERROR, "Escaped string is too long");
+		efree(replaced);
+		return NULL;
+	}
 
 	return replaced;
 }
@@ -1444,10 +1449,6 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
 	}
 
 	replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC);
-	if (new_len > INT_MAX) {
-		efree(replaced);
-		RETURN_FALSE;
-	}
 	RETVAL_STRINGL(replaced, (int)new_len, 0);
 }
 /* }}} */