Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
illarionov committed Nov 11, 2012
0 parents commit 5f5d729
Show file tree
Hide file tree
Showing 88 changed files with 30,179 additions and 0 deletions.
57 changes: 57 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@

# Compiler and base flags. Both use ?= so values from the environment or
# the make command line take precedence over these defaults.
CC?=gcc
#CFLAGS=-W -Wall -g -O0
CFLAGS?= -Os -DNDEBUG -s

# Directory prefix used by the "install" target, which copies the built
# library into $(DESTDIR)/lib.
DESTDIR?= /usr

# Snowball stemmers compiled into the extension. Each name selects a
# libstemmer_c/src_c/stem_UTF_8_<name>.c/.h pair and adds a
# -DWITH_STEMMER_<name> define (see the foreach expansions below).
STEMMERS?= danish dutch english finnish french german hungarian \
italian norwegian porter portuguese romanian russian \
spanish swedish

# FTS feature macros appended to whatever CFLAGS was chosen above.
# SQLITE_ENABLE_FTS4_UNICODE61 guards the fts3_unicode2.c prototypes in fts3Int.h.
CFLAGS+= \
-DSQLITE_ENABLE_FTS4 \
-DSQLITE_ENABLE_FTS4_UNICODE61

# Sources and headers belonging to the extension itself.
SOURCES= \
fts3_unicode2.c \
fts3_unicodesn.c \
extension.c

HEADERS= fts3_tokenizer.h

# Include search paths for the bundled Snowball (libstemmer_c) code.
INCLUDES= \
-Ilibstemmer_c/runtime \
-Ilibstemmer_c/src_c

# Libraries passed to the link step.
LIBRARIES= -lsqlite3

# Snowball runtime sources/headers that are always built, regardless of
# which stemmers are selected.
SNOWBALL_SOURCES= \
libstemmer_c/runtime/api_sq3.c \
libstemmer_c/runtime/utilities_sq3.c

SNOWBALL_HEADERS= \
libstemmer_c/include/libstemmer.h \
libstemmer_c/runtime/api.h \
libstemmer_c/runtime/header.h

# Per-stemmer additions: one source file, one header and one
# -DWITH_STEMMER_<name> define for every entry in STEMMERS.
SNOWBALL_SOURCES+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).c)

SNOWBALL_HEADERS+= $(foreach s, $(STEMMERS), libstemmer_c/src_c/stem_UTF_8_$(s).h)

SNOWBALL_FLAGS+= $(foreach s, $(STEMMERS), -DWITH_STEMMER_$(s))

# Default target: build the loadable SQLite extension.
all: unicodesn.sqlext

# Compile and link all extension and Snowball sources into one shared
# object in a single compiler invocation. Headers are listed as
# prerequisites so that editing one triggers a rebuild.
unicodesn.sqlext: $(HEADERS) $(SOURCES) $(SNOWBALL_HEADERS) $(SNOWBALL_SOURCES)
	$(CC) $(CFLAGS) $(SNOWBALL_FLAGS) $(INCLUDES) -fPIC -shared -fvisibility=hidden -o $@ \
		$(SOURCES) $(SNOWBALL_SOURCES) $(LIBRARIES)

# Remove build products.
clean:
	rm -f *.o unicodesn.sqlext

# Copy the built extension into $(DESTDIR)/lib, owned by root (so this
# target must be run with sufficient privileges).
install: unicodesn.sqlext
	mkdir -p ${DESTDIR}/lib 2> /dev/null
	install -D -o root -g root -m 644 unicodesn.sqlext ${DESTDIR}/lib

# "all" names no file either, so it is declared phony alongside the
# others; otherwise a stray file called "all" would disable the build.
.PHONY: all clean install
33 changes: 33 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
SQLite3-unicodesn
=================

SQLite "unicode" full-text-search tokenizer with Snowball stemming

Installation
============

$ git clone git://github.com/littlesavage/sqlite3-unicodesn.git
$ cd sqlite3-unicodesn
$ make
$ su
# make install

Usage
=====

$ sqlite3
sqlite> .load unicodesn.sqlext
sqlite> CREATE VIRTUAL TABLE fts USING fts3(text, tokenize=unicodesn "stemmer=russian");
sqlite> INSERT INTO fts VALUES ('Пионэры! Идите в жопу!');
sqlite> SELECT * FROM fts WHERE text MATCH 'Жопа';
Пионэры! Идите в жопу!

License
=======

Snowball files and stemmers are covered by the BSD license.

SQLite is in the Public Domain.

SQLite3-unicodesn code is in the Public Domain.

68 changes: 68 additions & 0 deletions extension.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
** 2012 November 11
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
*/
#include <sqlite3.h>
#include <sqlite3ext.h>

#include "fts3_unicodesn.h"

SQLITE_EXTENSION_INIT1

/*
** Register a tokenizer implementation with FTS3 or FTS4.
**
** Executes "SELECT fts3_tokenizer(?, ?)" on db, binding zName as the
** tokenizer name and the value of the module pointer p (not the module
** structure itself) as a blob of sizeof(p) bytes.
**
** Returns SQLITE_OK on success. On failure returns the SQLite error code
** from whichever step failed: preparing the statement, binding either
** parameter, or evaluating the statement (reported by sqlite3_finalize()).
*/
static int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt = 0;
  const char *zSql = "SELECT fts3_tokenizer(?, ?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* SQLITE_STATIC is safe here: zName and the local copy of p both
  ** outlive the statement, which is finalized before this function
  ** returns. */
  rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( rc==SQLITE_OK ){
    rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  }
  if( rc!=SQLITE_OK ){
    /* Do not step a statement whose parameters were not bound; release
    ** it and report the bind error. */
    sqlite3_finalize(pStmt);
    return rc;
  }

  /* The result of sqlite3_step() is deliberately not inspected:
  ** sqlite3_finalize() reports any error raised by the evaluation. */
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}

/*
** The Makefile compiles this library with -fvisibility=hidden, so the
** entry point has to be exported explicitly for the loader to find it.
*/
#if defined(__GNUC__) && !defined(_WIN32)
# define UNICODESN_EXPORT __attribute__((visibility("default")))
#else
# define UNICODESN_EXPORT
#endif

/* SQLite invokes this routine once when it loads the extension.
** Create new functions, collating sequences, and virtual table
** modules here. This is usually the only exported symbol in
** the shared library.
**
** Obtains the unicodesn tokenizer module and registers it under
** TOKENIZER_NAME on the connection db.
**
** Returns SQLITE_OK (0) on success. If registration fails, the SQLite
** error code is returned so that the load is reported as failed, and,
** when pzErrMsg is not NULL, *pzErrMsg is set to a message allocated
** with sqlite3_mprintf().
*/
UNICODESN_EXPORT
int sqlite3_extension_init(
  sqlite3 *db,                       /* The database connection */
  char **pzErrMsg,                   /* Write error messages here */
  const sqlite3_api_routines *pApi   /* API methods */
)
{
  const sqlite3_tokenizer_module *tokenizer = 0;
  int rc;

  SQLITE_EXTENSION_INIT2(pApi)

  sqlite3Fts3UnicodeSnTokenizer(&tokenizer);

  rc = registerTokenizer(db, TOKENIZER_NAME, tokenizer);
  if( rc!=SQLITE_OK && pzErrMsg!=0 ){
    *pzErrMsg = sqlite3_mprintf(
        "unable to register tokenizer \"%s\": %s",
        TOKENIZER_NAME, sqlite3_errmsg(db)
    );
  }

  return rc;
}



35 changes: 35 additions & 0 deletions fts3Int.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
** 2009 Nov 12
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
*/
#ifndef _FTSINT_H
#define _FTSINT_H

#include "sqlite3.h"
#include "fts3_tokenizer.h"

/*
** Short integer type aliases. The sizes in the trailing comments are the
** intended widths; the 2- and 4-byte aliases map to plain C types
** (short int, unsigned int), so their exact width depends on the platform.
*/
typedef unsigned char u8; /* 1-byte (or larger) unsigned integer */
typedef short int i16; /* 2-byte (or larger) signed integer */
typedef unsigned int u32; /* 4-byte unsigned integer (assumes a 32-bit int) */
typedef sqlite3_uint64 u64; /* 8-byte unsigned integer */
typedef sqlite3_int64 i64; /* 8-byte signed integer */

/*
** Evaluates x and discards the result. Intended for marking a function
** parameter as deliberately unused.
*/
#define UNUSED_PARAMETER(x) (void)(x)

/* fts3_unicode2.c (functions generated by parsing unicode text files) */
/* These prototypes are visible only when SQLITE_ENABLE_FTS4_UNICODE61 is
** defined at compile time. */
#ifdef SQLITE_ENABLE_FTS4_UNICODE61
int sqlite3FtsUnicodeFold(int, int);
int sqlite3FtsUnicodeIsalnum(int);
int sqlite3FtsUnicodeIsdiacritic(int);
#endif

#endif /* _FTSINT_H */
161 changes: 161 additions & 0 deletions fts3_tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"

/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
/* Forward declarations so the three structures can refer to one another. */
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

/*
** Table of callbacks that together implement one tokenizer. The member
** order is part of the interface shared with FTS and must not change.
*/
struct sqlite3_tokenizer_module {

/*
** Structure version. Should always be set to 0 or 1. A value of 1 (or
** more) indicates that the xLanguageid member at the end of this
** structure is present and valid.
*/
int iVersion;

/*
** Create a new tokenizer. The values in the argv[] array are the
** arguments passed to the "tokenize=" clause of the CREATE VIRTUAL
** TABLE statement that created the fts3 table. For example, if
** the following SQL is executed:
**
** CREATE .. USING fts3( ... , tokenize=<tokenizer-name> arg1 arg2)
**
** then argc is set to 2, and the argv[] array contains pointers
** to the strings "arg1" and "arg2".
**
** This method should return either SQLITE_OK (0), or an SQLite error
** code. If SQLITE_OK is returned, then *ppTokenizer should be set
** to point at the newly created tokenizer structure. The generic
** sqlite3_tokenizer.pModule variable should not be initialised by
** this callback. The caller will do so.
*/
int (*xCreate)(
int argc, /* Size of argv array */
const char *const*argv, /* Tokenizer argument strings */
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
);

/*
** Destroy an existing tokenizer. The fts3 module calls this method
** exactly once for each successful call to xCreate().
*/
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

/*
** Create a tokenizer cursor to tokenize an input buffer. The caller
** is responsible for ensuring that the input buffer remains valid
** until the cursor is closed (using the xClose() method).
*/
int (*xOpen)(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
);

/*
** Destroy an existing tokenizer cursor. The fts3 module calls this
** method exactly once for each successful call to xOpen().
*/
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

/*
** Retrieve the next token from the tokenizer cursor pCursor. This
** method should either return SQLITE_OK and set the values of the
** "OUT" variables identified below, or SQLITE_DONE to indicate that
** the end of the buffer has been reached, or an SQLite error code.
**
** *ppToken should be set to point at a buffer containing the
** normalized version of the token (i.e. after any case-folding and/or
** stemming has been performed). *pnBytes should be set to the length
** of this buffer in bytes. The input text that generated the token is
** identified by the byte offsets returned in *piStartOffset and
** *piEndOffset. *piStartOffset should be set to the index of the first
** byte of the token in the input buffer. *piEndOffset should be set
** to the index of the first byte just past the end of the token in
** the input buffer.
**
** The buffer *ppToken is set to point at is managed by the tokenizer
** implementation. It is only required to be valid until the next call
** to xNext() or xClose().
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xNext)(
sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
int *piStartOffset, /* OUT: Byte offset of token in input buffer */
int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
int *piPosition /* OUT: Number of tokens returned before this one */
);

/***********************************************************************
** Methods below this point are only available if iVersion>=1.
*/

/*
** Configure the language id of a tokenizer cursor. Only present when
** iVersion>=1, so it must not be read from a module whose iVersion is 0.
*/
int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
};

/*
** Generic part of a tokenizer instance, as produced by xCreate().
** pModule is filled in by the caller of xCreate(), not by xCreate()
** itself (see the xCreate() description in sqlite3_tokenizer_module).
*/
struct sqlite3_tokenizer {
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};

/*
** Generic part of a tokenizer cursor, as produced by xOpen() for one
** input buffer.
** NOTE(review): this header does not say whether xOpen() or its caller
** sets pTokenizer - confirm against the FTS implementation.
*/
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};

/*
** Term-count helpers, each taking a term index and a column index.
** NOTE(review): no definition or caller is visible alongside this header,
** so the exact meaning of iTerm/iCol and of the returned count is
** unconfirmed - verify before relying on these declarations.
*/
int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);


#endif /* _FTS3_TOKENIZER_H_ */
Loading

0 comments on commit 5f5d729

Please sign in to comment.