AI2 Component  (Version nb184)
HtmlEntities.java
Go to the documentation of this file.
1 // -*- mode: java; c-basic-offset: 2; -*-
2 // Copyright 2009-2011 Google, All Rights reserved
3 // Copyright 2011-2012 MIT, All rights reserved
4 // Released under the Apache License, Version 2.0
5 // http://www.apache.org/licenses/LICENSE-2.0
6 
7 package com.google.appinventor.components.common;
8 
9 import java.util.HashMap;
10 import java.util.Map;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
13 
/**
 * Utility for translating HTML character references into the characters
 * they denote.
 *
 * <p>Named entities are resolved through a fixed table populated in the
 * static initializer below; numeric references (decimal {@code &#nnnn;} and
 * hexadecimal {@code &#xhhhh;}) are decoded arithmetically.
 */
public class HtmlEntities {

  /**
   * Matches a complete character reference: an ampersand, an optional
   * {@code #}, one or more ASCII letters or digits, and a terminating
   * semicolon. Group 1 is the text between the ampersand and the semicolon.
   */
  private static final Pattern HTML_ENTITY_PATTERN = Pattern.compile("&(#?[0-9a-zA-Z]+);");

  /** Entity name (without {@code &} and {@code ;}) to the character it denotes. */
  private static final Map<String, Character> lookup = new HashMap<String, Character>();

  static {
    // Letter entities
    lookup.put("Agrave", (char) 192);
    lookup.put("agrave", (char) 224);
    lookup.put("Aacute", (char) 193);
    lookup.put("aacute", (char) 225);
    lookup.put("Acirc", (char) 194);
    lookup.put("acirc", (char) 226);
    lookup.put("Atilde", (char) 195);
    lookup.put("atilde", (char) 227);
    lookup.put("Auml", (char) 196);
    lookup.put("auml", (char) 228);
    lookup.put("Aring", (char) 197);
    lookup.put("aring", (char) 229);
    lookup.put("AElig", (char) 198);
    lookup.put("aelig", (char) 230);
    lookup.put("Ccedil", (char) 199);
    lookup.put("ccedil", (char) 231);
    lookup.put("Egrave", (char) 200);
    lookup.put("egrave", (char) 232);
    lookup.put("Eacute", (char) 201);
    lookup.put("eacute", (char) 233);
    lookup.put("Ecirc", (char) 202);
    lookup.put("ecirc", (char) 234);
    lookup.put("Euml", (char) 203);
    lookup.put("euml", (char) 235);
    lookup.put("Igrave", (char) 204);
    lookup.put("igrave", (char) 236);
    lookup.put("Iacute", (char) 205);
    lookup.put("iacute", (char) 237);
    lookup.put("Icirc", (char) 206);
    lookup.put("icirc", (char) 238);
    lookup.put("Iuml", (char) 207);
    lookup.put("iuml", (char) 239);
    lookup.put("ETH", (char) 208);
    lookup.put("eth", (char) 240);
    lookup.put("Ntilde", (char) 209);
    lookup.put("ntilde", (char) 241);
    lookup.put("Ograve", (char) 210);
    lookup.put("ograve", (char) 242);
    lookup.put("Oacute", (char) 211);
    lookup.put("oacute", (char) 243);
    lookup.put("Ocirc", (char) 212);
    lookup.put("ocirc", (char) 244);
    lookup.put("Otilde", (char) 213);
    lookup.put("otilde", (char) 245);
    lookup.put("Ouml", (char) 214);
    lookup.put("ouml", (char) 246);
    lookup.put("Oslash", (char) 216);
    lookup.put("oslash", (char) 248);
    lookup.put("Ugrave", (char) 217);
    lookup.put("ugrave", (char) 249);
    lookup.put("Uacute", (char) 218);
    lookup.put("uacute", (char) 250);
    lookup.put("Ucirc", (char) 219);
    lookup.put("ucirc", (char) 251);
    lookup.put("Uuml", (char) 220);
    lookup.put("uuml", (char) 252);
    lookup.put("Yacute", (char) 221);
    lookup.put("yacute", (char) 253);
    lookup.put("THORN", (char) 222);
    lookup.put("thorn", (char) 254);
    lookup.put("szlig", (char) 223);
    lookup.put("yuml", (char) 255);
    lookup.put("Yuml", (char) 376);
    lookup.put("OElig", (char) 338);
    lookup.put("oelig", (char) 339);
    lookup.put("Scaron", (char) 352);
    lookup.put("scaron", (char) 353);
    lookup.put("Alpha", (char) 913);
    lookup.put("Beta", (char) 914);
    lookup.put("Gamma", (char) 915);
    lookup.put("Delta", (char) 916);
    lookup.put("Epsilon", (char) 917);
    lookup.put("Zeta", (char) 918);
    lookup.put("Eta", (char) 919);
    lookup.put("Theta", (char) 920);
    lookup.put("Iota", (char) 921);
    lookup.put("Kappa", (char) 922);
    lookup.put("Lambda", (char) 923);
    lookup.put("Mu", (char) 924);
    lookup.put("Nu", (char) 925);
    lookup.put("Xi", (char) 926);
    lookup.put("Omicron", (char) 927);
    lookup.put("Pi", (char) 928);
    lookup.put("Rho", (char) 929);
    lookup.put("Sigma", (char) 931);
    lookup.put("Tau", (char) 932);
    lookup.put("Upsilon", (char) 933);
    lookup.put("Phi", (char) 934);
    lookup.put("Chi", (char) 935);
    lookup.put("Psi", (char) 936);
    lookup.put("Omega", (char) 937);
    lookup.put("alpha", (char) 945);
    lookup.put("beta", (char) 946);
    lookup.put("gamma", (char) 947);
    lookup.put("delta", (char) 948);
    lookup.put("epsilon", (char) 949);
    lookup.put("zeta", (char) 950);
    lookup.put("eta", (char) 951);
    lookup.put("theta", (char) 952);
    lookup.put("iota", (char) 953);
    lookup.put("kappa", (char) 954);
    lookup.put("lambda", (char) 955);
    lookup.put("mu", (char) 956);
    lookup.put("nu", (char) 957);
    lookup.put("xi", (char) 958);
    lookup.put("omicron", (char) 959);
    lookup.put("pi", (char) 960);
    lookup.put("rho", (char) 961);
    lookup.put("sigmaf", (char) 962);
    lookup.put("sigma", (char) 963);
    lookup.put("tau", (char) 964);
    lookup.put("upsilon", (char) 965);
    lookup.put("phi", (char) 966);
    lookup.put("chi", (char) 967);
    lookup.put("psi", (char) 968);
    lookup.put("omega", (char) 969);
    lookup.put("thetasym", (char) 977);
    lookup.put("upsih", (char) 978);
    lookup.put("piv", (char) 982);
    // Non-letter entities
    lookup.put("iexcl", (char) 161);
    lookup.put("cent", (char) 162);
    lookup.put("pound", (char) 163);
    lookup.put("curren", (char) 164);
    lookup.put("yen", (char) 165);
    lookup.put("brvbar", (char) 166);
    lookup.put("sect", (char) 167);
    lookup.put("uml", (char) 168);
    lookup.put("copy", (char) 169);
    lookup.put("ordf", (char) 170);
    lookup.put("laquo", (char) 171);
    lookup.put("not", (char) 172);
    lookup.put("shy", (char) 173);
    lookup.put("reg", (char) 174);
    lookup.put("macr", (char) 175);
    lookup.put("deg", (char) 176);
    lookup.put("plusmn", (char) 177);
    lookup.put("sup2", (char) 178);
    lookup.put("sup3", (char) 179);
    lookup.put("acute", (char) 180);
    lookup.put("micro", (char) 181);
    lookup.put("para", (char) 182);
    lookup.put("middot", (char) 183);
    lookup.put("cedil", (char) 184);
    lookup.put("sup1", (char) 185);
    lookup.put("ordm", (char) 186);
    lookup.put("raquo", (char) 187);
    lookup.put("frac14", (char) 188);
    lookup.put("frac12", (char) 189);
    lookup.put("frac34", (char) 190);
    lookup.put("iquest", (char) 191);
    lookup.put("times", (char) 215);
    lookup.put("divide", (char) 247);
    lookup.put("fnof", (char) 402);
    lookup.put("circ", (char) 710);
    lookup.put("tilde", (char) 732);
    lookup.put("lrm", (char) 8206);
    lookup.put("rlm", (char) 8207);
    lookup.put("ndash", (char) 8211);
    lookup.put("endash", (char) 8211);
    lookup.put("mdash", (char) 8212);
    lookup.put("emdash", (char) 8212);
    lookup.put("lsquo", (char) 8216);
    lookup.put("rsquo", (char) 8217);
    lookup.put("sbquo", (char) 8218);
    lookup.put("ldquo", (char) 8220);
    lookup.put("rdquo", (char) 8221);
    lookup.put("bdquo", (char) 8222);
    lookup.put("dagger", (char) 8224);
    lookup.put("Dagger", (char) 8225);
    lookup.put("bull", (char) 8226);
    lookup.put("hellip", (char) 8230);
    lookup.put("permil", (char) 8240);
    lookup.put("prime", (char) 8242);
    lookup.put("Prime", (char) 8243);
    lookup.put("lsaquo", (char) 8249);
    lookup.put("rsaquo", (char) 8250);
    lookup.put("oline", (char) 8254);
    lookup.put("frasl", (char) 8260);
    lookup.put("euro", (char) 8364);
    lookup.put("image", (char) 8465);
    lookup.put("weierp", (char) 8472);
    lookup.put("real", (char) 8476);
    lookup.put("trade", (char) 8482);
    lookup.put("alefsym", (char) 8501);
    lookup.put("larr", (char) 8592);
    lookup.put("uarr", (char) 8593);
    lookup.put("rarr", (char) 8594);
    lookup.put("darr", (char) 8595);
    lookup.put("harr", (char) 8596);
    lookup.put("crarr", (char) 8629);
    lookup.put("lArr", (char) 8656);
    lookup.put("uArr", (char) 8657);
    lookup.put("rArr", (char) 8658);
    lookup.put("dArr", (char) 8659);
    lookup.put("hArr", (char) 8660);
    lookup.put("forall", (char) 8704);
    lookup.put("part", (char) 8706);
    lookup.put("exist", (char) 8707);
    lookup.put("empty", (char) 8709);
    lookup.put("nabla", (char) 8711);
    lookup.put("isin", (char) 8712);
    lookup.put("notin", (char) 8713);
    lookup.put("ni", (char) 8715);
    lookup.put("prod", (char) 8719);
    lookup.put("sum", (char) 8721);
    lookup.put("minus", (char) 8722);
    lookup.put("lowast", (char) 8727);
    lookup.put("radic", (char) 8730);
    lookup.put("prop", (char) 8733);
    lookup.put("infin", (char) 8734);
    lookup.put("ang", (char) 8736);
    lookup.put("and", (char) 8743);
    lookup.put("or", (char) 8744);
    lookup.put("cap", (char) 8745);
    lookup.put("cup", (char) 8746);
    lookup.put("int", (char) 8747);
    lookup.put("there4", (char) 8756);
    lookup.put("sim", (char) 8764);
    lookup.put("cong", (char) 8773);
    lookup.put("asymp", (char) 8776);
    lookup.put("ne", (char) 8800);
    lookup.put("equiv", (char) 8801);
    lookup.put("le", (char) 8804);
    lookup.put("ge", (char) 8805);
    lookup.put("sub", (char) 8834);
    lookup.put("sup", (char) 8835);
    lookup.put("nsub", (char) 8836);
    lookup.put("sube", (char) 8838);
    lookup.put("supe", (char) 8839);
    lookup.put("oplus", (char) 8853);
    lookup.put("otimes", (char) 8855);
    lookup.put("perp", (char) 8869);
    lookup.put("sdot", (char) 8901);
    lookup.put("lceil", (char) 8968);
    lookup.put("rceil", (char) 8969);
    lookup.put("lfloor", (char) 8970);
    lookup.put("rfloor", (char) 8971);
    lookup.put("lang", (char) 9001);
    lookup.put("rang", (char) 9002);
    lookup.put("loz", (char) 9674);
    lookup.put("spades", (char) 9824);
    lookup.put("clubs", (char) 9827);
    lookup.put("hearts", (char) 9829);
    lookup.put("diams", (char) 9830);
    // "Special" entities
    lookup.put("gt", (char) 62);
    lookup.put("GT", (char) 62);
    lookup.put("lt", (char) 60);
    lookup.put("LT", (char) 60);
    lookup.put("quot", (char) 34);
    lookup.put("QUOT", (char) 34);
    lookup.put("amp", (char) 38);
    lookup.put("AMP", (char) 38);
    lookup.put("apos", (char) 39);
    // "Whitespace" entities
    lookup.put("nbsp", (char) 160);
    lookup.put("ensp", (char) 8194);
    lookup.put("emsp", (char) 8195);
    lookup.put("thinsp", (char) 8201);
    lookup.put("zwnj", (char) 8204);
    // "Ignore" entities
    lookup.put("zwj", (char) 8205);
  }

  /**
   * Looks up the character denoted by a named HTML entity.
   *
   * @param entityName the entity name without the leading {@code &} and
   *     trailing {@code ;} (for example {@code "amp"}); names are
   *     case-sensitive
   * @return the corresponding character, or {@code null} if the name is not
   *     in the table
   */
  public static Character toCharacter(String entityName) {
    return lookup.get(entityName);
  }

  /**
   * Replaces every recognised character reference in {@code htmlText} with
   * the character it denotes.
   *
   * <p>Recognised forms are named entities present in the lookup table,
   * decimal references ({@code &#nnnn;}) and hexadecimal references
   * ({@code &#xhhhh;} or {@code &#Xhhhh;}). Numeric references may denote any
   * valid Unicode code point, including supplementary ones, which are emitted
   * as a surrogate pair. References that are unknown, malformed, or outside
   * the Unicode range are copied to the output unchanged.
   *
   * @param htmlText the text to decode; must not be {@code null}
   * @return the decoded text; the same instance is returned when the input
   *     is empty or contains no ampersand
   * @throws NullPointerException if {@code htmlText} is {@code null}
   */
  public static String decodeHtmlText(String htmlText) {
    // Fast path: nothing that could possibly be a character reference.
    if (htmlText.length() == 0 || htmlText.indexOf('&') == -1) {
      return htmlText;
    }

    StringBuilder output = new StringBuilder(htmlText.length());
    int lastMatchEnd = 0;
    Matcher matcher = HTML_ENTITY_PATTERN.matcher(htmlText);
    while (matcher.find()) {
      // Group 1 is the reference without its ampersand and semicolon.
      String replacement = decodeEntity(matcher.group(1));
      if (replacement != null) {
        // Copy the literal text since the previous replacement, then the
        // decoded character(s). Unrecognised references are not consumed
        // here, so they are carried over verbatim by a later copy.
        output.append(htmlText, lastMatchEnd, matcher.start());
        output.append(replacement);
        lastMatchEnd = matcher.end();
      }
    }
    // Trailing literal text (a no-op when the last match ended the input).
    output.append(htmlText, lastMatchEnd, htmlText.length());
    return output.toString();
  }

  /**
   * Decodes the body of a single character reference.
   *
   * @param entity the non-empty text between {@code &} and {@code ;}
   * @return the decoded text, or {@code null} if the reference is not a
   *     known name or not a valid numeric reference
   */
  private static String decodeEntity(String entity) {
    if (entity.charAt(0) != '#') {
      Character named = lookup.get(entity);
      return named == null ? null : named.toString();
    }

    // HTML accepts both "x" and "X" as the hexadecimal marker.
    boolean isHex = entity.length() > 1
        && (entity.charAt(1) == 'x' || entity.charAt(1) == 'X');
    try {
      int codePoint = isHex
          ? Integer.parseInt(entity.substring(2), 16)
          : Integer.parseInt(entity.substring(1), 10);
      if (!Character.isValidCodePoint(codePoint)) {
        return null;
      }
      // toChars yields a surrogate pair for code points above U+FFFF, which
      // a plain cast to char would silently truncate.
      return new String(Character.toChars(codePoint));
    } catch (NumberFormatException e) {
      // No digits, non-digit characters, or a value too large for an int:
      // treat as unrecognised so the caller leaves the text untouched.
      return null;
    }
  }
}
com.google.appinventor.components.common.HtmlEntities
Definition: HtmlEntities.java:22
com.google.appinventor.components.common.HtmlEntities.toCharacter
static Character toCharacter(String entityName)
Definition: HtmlEntities.java:302
com.google.appinventor.components.common.HtmlEntities.decodeHtmlText
static String decodeHtmlText(String htmlText)
Definition: HtmlEntities.java:319