Rename and slightly redefine the default text search parser's "word" categories, as per discussion.

asciiword (formerly lword) is still ASCII-letters-only, and numword (formerly word) is still the most general mixed-alpha-and-digits case. But word (formerly nlword) is now any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as before. This is no worse than before for parsing mixed Russian/English text, which seems to have been the design center for the original coding; and it should simplify matters for parsing most European languages. In particular it will not be necessary for any language to accept strings containing digits as being regular "words". The hyphenated-word categories are adjusted similarly.
author: Tom Lane <tgl@sss.pgh.pa.us> 2007-10-23 20:46:12 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2007-10-23 20:46:12 +0000
commit: dbaec70c153239224c0288d865b96c2f939fbdf5 (patch)
tree: a2309acc315e5d4b9f9b0cd8b2ad60dc999ba93d /src/test/regress/sql/tsdicts.sql
parent: 344d0cae64dbf398559b855806fc7338ec0a2e64 (diff)
download: postgresql-dbaec70c153239224c0288d865b96c2f939fbdf5.tar.gz
1 files changed, 5 insertions, 6 deletions
diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql
index 2e6cf791d8..f36e63a311 100644
--- a/src/test/regress/sql/tsdicts.sql
+++ b/src/test/regress/sql/tsdicts.sql
@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
 SELECT ts_lexize('synonym', 'Gogle');
 
 -- Create and simple test thesaurus dictionary
--- More test in configuration checks because of ts_lexize
--- can not give more tat one word as it may wish thesaurus.
+-- More tests in configuration checks because ts_lexize()
+-- cannot pass more than one word to thesaurus.
 CREATE TEXT SEARCH DICTIONARY thesaurus (
                         Template=thesaurus,
 						DictFile=thesaurus_sample, 
@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
 );
 
 ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
-	hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word 
+	word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
 	WITH ispell, english_stem;
 
 SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
 );
 
 ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, english_stem;
 
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
 );
 
 ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, thesaurus, english_stem;
 
 SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
 SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
 SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
-
author	Tom Lane <tgl@sss.pgh.pa.us>	2007-10-23 20:46:12 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2007-10-23 20:46:12 +0000
commit	dbaec70c153239224c0288d865b96c2f939fbdf5 (patch)
tree	a2309acc315e5d4b9f9b0cd8b2ad60dc999ba93d /src/test/regress/sql/tsdicts.sql
parent	344d0cae64dbf398559b855806fc7338ec0a2e64 (diff)
download	postgresql-dbaec70c153239224c0288d865b96c2f939fbdf5.tar.gz