Initial commit

Spotme · Nov 11, 2012 · 5f5d729 · 5f5d729
commit 5f5d729
Show file tree

Hide file tree

Showing 88 changed files with 30,179 additions and 0 deletions.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,57 @@
+
+# Toolchain and optimisation flags. Both are assigned with ?= so they can
+# be overridden from the environment or the make command line.
+CC?=gcc
+# Debug alternative to the optimised flags below:
+#CFLAGS=-W  -Wall -g -O0
+CFLAGS?= -Os -DNDEBUG -s
+
+# Root of the install tree; the extension is copied to $(DESTDIR)/lib.
+DESTDIR?= /usr
+
+# Snowball stemmers compiled into the extension. Each name selects
+# libstemmer_c/src_c/stem_UTF_8_<name>.c/.h and adds -DWITH_STEMMER_<name>.
+STEMMERS?= danish dutch english finnish french german hungarian \
+  italian norwegian porter portuguese romanian russian \
+  spanish swedish
+
+# Feature macros that are always appended, even when CFLAGS is overridden.
+CFLAGS+= \
+  -DSQLITE_ENABLE_FTS4 \
+  -DSQLITE_ENABLE_FTS4_UNICODE61
+
+# Sources and headers of the tokenizer itself.
+SOURCES= \
+  fts3_unicode2.c \
+  fts3_unicodesn.c \
+  extension.c
+
+HEADERS=	fts3_tokenizer.h
+
+INCLUDES= \
+  -Ilibstemmer_c/runtime \
+  -Ilibstemmer_c/src_c
+
+LIBRARIES=	-lsqlite3
+
+# Snowball runtime; the per-language stemmer files are appended below.
+SNOWBALL_SOURCES= \
+  libstemmer_c/runtime/api_sq3.c \
+  libstemmer_c/runtime/utilities_sq3.c
+
+SNOWBALL_HEADERS= \
+  libstemmer_c/include/libstemmer.h \
+  libstemmer_c/runtime/api.h \
+  libstemmer_c/runtime/header.h
+
+SNOWBALL_SOURCES+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).c)
+
+SNOWBALL_HEADERS+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).h)
+
+SNOWBALL_FLAGS+= $(foreach s, $(STEMMERS), -DWITH_STEMMER_$(s))
+
+# Default target: build the loadable extension.
+all: unicodesn.sqlext
+
+# Everything is compiled and linked in a single compiler invocation, so the
+# shared object is rebuilt whenever any source or header changes.
+unicodesn.sqlext: $(HEADERS) $(SOURCES) $(SNOWBALL_HEADERS) $(SNOWBALL_SOURCES)
+	$(CC) $(CFLAGS) $(SNOWBALL_FLAGS) $(INCLUDES) -fPIC -shared -fvisibility=hidden -o $@ \
+	   $(SOURCES) $(SNOWBALL_SOURCES) $(LIBRARIES)
+
+clean:
+	rm -f *.o unicodesn.sqlext
+
+# Installs with root ownership, so this target is expected to run as root.
+install: unicodesn.sqlext
+	mkdir -p ${DESTDIR}/lib 2> /dev/null
+	install -D -o root -g root -m 644 unicodesn.sqlext ${DESTDIR}/lib
+
+# "all" is listed too, so a stray file named "all" cannot mask the default
+# target.
+.PHONY: all clean install
diff --git a/README b/README
@@ -0,0 +1,33 @@
+SQLite3-unicodesn
+=================
+
+SQLite "unicode" full-text-search tokenizer with Snowball stemming
+
+Installation
+============
+
+   $ git clone git://github.com/littlesavage/sqlite3-unicodesn.git
+   $ cd sqlite3-unicodesn
+   $ make
+   $ su
+   # make install
+
+Usage
+======
+
+    $ sqlite3
+    sqlite> .load unicodesn.sqlext
+    sqlite> CREATE VIRTUAL TABLE fts USING fts3(text, tokenize=unicodesn "stemmer=russian");
+    sqlite> INSERT INTO fts VALUES ('Пионэры! Идите в жопу!');
+    sqlite> SELECT * FROM fts WHERE text MATCH 'Жопа';
+    Пионэры! Идите в жопу!
+
+License
+=======
+
+Snowball files and stemmers are covered by the BSD license.
+
+SQLite is in the Public Domain.
+
+SQLite3-unicodesn code is in the Public Domain.
+
diff --git a/extension.c b/extension.c
@@ -0,0 +1,68 @@
+/*
+** 2012 November 11
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+*/
+#include <sqlite3.h>
+#include <sqlite3ext.h>
+
+#include "fts3_unicodesn.h"
+
+SQLITE_EXTENSION_INIT1
+
+/*
+** Register a tokenizer implementation with FTS3 or FTS4.
+**
+** The registration is performed by evaluating the SQL statement
+** "SELECT fts3_tokenizer(?, ?)" on connection db, with zName bound as
+** the first parameter and the value of the module pointer p bound as a
+** blob for the second.
+**
+** Returns SQLITE_OK on success. If preparing the statement or binding
+** either parameter fails, that error code is returned and the statement
+** is not executed; otherwise the result of sqlite3_finalize() is
+** returned.
+*/
+static int registerTokenizer(
+  sqlite3 *db,
+  char *zName,
+  const sqlite3_tokenizer_module *p
+){
+  int rc;
+  sqlite3_stmt *pStmt = 0;
+  const char *zSql = "SELECT fts3_tokenizer(?, ?)";
+
+  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
+  if( rc!=SQLITE_OK ){
+    return rc;
+  }
+
+  /* SQLITE_STATIC is safe for both bindings: zName and the local copy of
+  ** the pointer p stay valid until the statement is finalized below. */
+  rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
+  }
+  if( rc!=SQLITE_OK ){
+    /* The statement never ran, so sqlite3_finalize() would not report the
+    ** bind failure; return the bind error code explicitly. */
+    sqlite3_finalize(pStmt);
+    return rc;
+  }
+
+  /* Any error raised by the step is reported by sqlite3_finalize(). */
+  sqlite3_step(pStmt);
+
+  return sqlite3_finalize(pStmt);
+}
+
+/* SQLite invokes this routine once when it loads the extension.
+** Create new functions, collating sequences, and virtual table
+** modules here.  This is usually the only exported symbol in
+** the shared library.
+**
+** This extension registers a single FTS tokenizer under the name
+** TOKENIZER_NAME. The result of that registration is returned, so a
+** failure to register is reported to SQLite instead of being hidden
+** behind an unconditional success code.
+*/
+int sqlite3_extension_init(
+      sqlite3 *db,          /* The database connection */
+      char **pzErrMsg,      /* Write error messages here */
+      const sqlite3_api_routines *pApi  /* API methods */
+      )
+{
+   const sqlite3_tokenizer_module *tokenizer;
+
+   SQLITE_EXTENSION_INIT2(pApi)
+
+   /* No message is produced here; only the result code is reported. */
+   (void)pzErrMsg;
+
+   sqlite3Fts3UnicodeSnTokenizer(&tokenizer);
+
+   return registerTokenizer(db, TOKENIZER_NAME, tokenizer);
+}
+
+
+
diff --git a/fts3Int.h b/fts3Int.h
@@ -0,0 +1,35 @@
+/*
+** 2009 Nov 12
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+*/
+#ifndef _FTSINT_H
+#define _FTSINT_H
+
+#include "sqlite3.h"
+#include "fts3_tokenizer.h"
+
+typedef unsigned char u8;         /* 1-byte (or larger) unsigned integer */
+typedef short int i16;            /* 2-byte (or larger) signed integer */
+typedef unsigned int u32;         /* 4-byte unsigned integer */
+typedef sqlite3_uint64 u64;       /* 8-byte unsigned integer */
+typedef sqlite3_int64 i64;        /* 8-byte signed integer */
+
+#define UNUSED_PARAMETER(x) (void)(x)   /* Evaluates x and discards the result */
+
+/* fts3_unicode2.c (functions generated by parsing unicode text files).
+** These prototypes are visible only when SQLITE_ENABLE_FTS4_UNICODE61 is
+** defined, which the Makefile does via CFLAGS.
+** NOTE(review): the parameters are unnamed here; presumably the int
+** arguments are Unicode codepoints - confirm against fts3_unicode2.c. */
+#ifdef SQLITE_ENABLE_FTS4_UNICODE61
+int sqlite3FtsUnicodeFold(int, int);
+int sqlite3FtsUnicodeIsalnum(int);
+int sqlite3FtsUnicodeIsdiacritic(int);
+#endif
+
+#endif /* _FTSINT_H */
diff --git a/fts3_tokenizer.h b/fts3_tokenizer.h
@@ -0,0 +1,161 @@
+/*
+** 2006 July 10
+**
+** The author disclaims copyright to this source code.
+**
+*************************************************************************
+** Defines the interface to tokenizers used by fulltext-search.  There
+** are three basic components:
+**
+** sqlite3_tokenizer_module is a singleton defining the tokenizer
+** interface functions.  This is essentially the class structure for
+** tokenizers.
+**
+** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
+** including customization information defined at creation time.
+**
+** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
+** tokens from a particular input.
+*/
+#ifndef _FTS3_TOKENIZER_H_
+#define _FTS3_TOKENIZER_H_
+
+/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
+** If tokenizers are to be allowed to call sqlite3_*() functions, then
+** we will need a way to register the API consistently.
+*/
+#include "sqlite3.h"
+
+/*
+** Structures used by the tokenizer interface. When a new tokenizer
+** implementation is registered, the caller provides a pointer to
+** an sqlite3_tokenizer_module containing pointers to the callback
+** functions that make up an implementation.
+**
+** When an fts3 table is created, it passes any arguments passed to
+** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
+** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
+** implementation. The xCreate() function in turn returns an 
+** sqlite3_tokenizer structure representing the specific tokenizer to
+** be used for the fts3 table (customized by the tokenizer clause arguments).
+**
+** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
+** method is called. It returns an sqlite3_tokenizer_cursor object
+** that may be used to tokenize a specific input buffer based on
+** the tokenization rules supplied by a specific sqlite3_tokenizer
+** object.
+*/
+typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
+typedef struct sqlite3_tokenizer sqlite3_tokenizer;
+typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
+
+struct sqlite3_tokenizer_module {
+
+  /*
+  ** Structure version. Should always be set to 0 or 1. The xLanguageid
+  ** method at the end of this structure is only available when
+  ** iVersion>=1.
+  */
+  int iVersion;
+
+  /*
+  ** Create a new tokenizer. The values in the argv[] array are the
+  ** arguments passed to the "tokenize" clause of the CREATE VIRTUAL
+  ** TABLE statement that created the fts3 table. For example, if
+  ** the following SQL is executed:
+  **
+  **   CREATE .. USING fts3( ... , tokenize=<tokenizer-name> arg1 arg2)
+  **
+  ** then argc is set to 2, and the argv[] array contains pointers
+  ** to the strings "arg1" and "arg2".
+  **
+  ** This method should return either SQLITE_OK (0), or an SQLite error
+  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
+  ** to point at the newly created tokenizer structure. The generic
+  ** sqlite3_tokenizer.pModule variable should not be initialised by
+  ** this callback. The caller will do so.
+  */
+  int (*xCreate)(
+    int argc,                           /* Size of argv array */
+    const char *const*argv,             /* Tokenizer argument strings */
+    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
+  );
+
+  /*
+  ** Destroy an existing tokenizer. The fts3 module calls this method
+  ** exactly once for each successful call to xCreate().
+  */
+  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
+
+  /*
+  ** Create a tokenizer cursor to tokenize an input buffer. The caller
+  ** is responsible for ensuring that the input buffer remains valid
+  ** until the cursor is closed (using the xClose() method).
+  */
+  int (*xOpen)(
+    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
+    const char *pInput, int nBytes,      /* Input buffer */
+    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
+  );
+
+  /*
+  ** Destroy an existing tokenizer cursor. The fts3 module calls this
+  ** method exactly once for each successful call to xOpen().
+  */
+  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
+
+  /*
+  ** Retrieve the next token from the tokenizer cursor pCursor. This
+  ** method should either return SQLITE_OK and set the values of the
+  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
+  ** the end of the buffer has been reached, or an SQLite error code.
+  **
+  ** *ppToken should be set to point at a buffer containing the
+  ** normalized version of the token (i.e. after any case-folding and/or
+  ** stemming has been performed). *pnBytes should be set to the length
+  ** of this buffer in bytes. The input text that generated the token is
+  ** identified by the byte offsets returned in *piStartOffset and
+  ** *piEndOffset. *piStartOffset should be set to the index of the first
+  ** byte of the token in the input buffer. *piEndOffset should be set
+  ** to the index of the first byte just past the end of the token in
+  ** the input buffer.
+  **
+  ** The buffer *ppToken is set to point at is managed by the tokenizer
+  ** implementation. It is only required to be valid until the next call
+  ** to xNext() or xClose().
+  */
+  /* TODO(shess) current implementation requires pInput to be
+  ** nul-terminated.  This should either be fixed, or pInput/nBytes
+  ** should be converted to zInput.
+  */
+  int (*xNext)(
+    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
+    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
+    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
+    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
+    int *piPosition      /* OUT: Number of tokens returned before this one */
+  );
+
+  /***********************************************************************
+  ** Methods below this point are only available if iVersion>=1.
+  */
+
+  /*
+  ** Configure the language id of a tokenizer cursor. pCsr is the cursor
+  ** to configure and iLangid is the language id to apply to it.
+  */
+  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
+};
+
+struct sqlite3_tokenizer {
+  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
+  /* Tokenizer implementations will typically add additional fields.
+  ** pModule is filled in by the caller of xCreate(), not by the
+  ** xCreate() callback itself. */
+};
+
+struct sqlite3_tokenizer_cursor {
+  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
+  /* Tokenizer implementations will typically add additional fields.
+  ** Cursors are created by xOpen() and destroyed by xClose(). */
+};
+
+int fts3_global_term_cnt(int iTerm, int iCol);  /* NOTE(review): no definition visible in this view; presumably a count for term iTerm in column iCol across the index - confirm */
+int fts3_term_cnt(int iTerm, int iCol);         /* NOTE(review): no definition visible in this view; presumably the per-row counterpart of the above - confirm */
+
+
+#endif /* _FTS3_TOKENIZER_H_ */