//
// Aspell's main word list data is stored in 4 large blocks of memory
//
// * The Word Hash Table
// * The Word List
// * The Soundslike Hash Table
// * The Soundslike List
//
// 1a) The Word Hash Table
// This consists of an open address hash table which contains pointers
// to the actual words in the word list
//
// 1b) The Word List
// This consists of the actual word list and is laid out as follows:
//   <Word1><null char><Word2><null char>...
//
// 2a) The Soundslike Hash Table
// This consists of an open address hash table which contains pointers
// to a soundslike object.
//
// 2b) The Soundslike Object
// The soundslike object is laid out as follows:
//  What:  <Word1 pointer><Word2 p.>...<Num of Words><Soundslike><null char>
//  Types: <const char *><const char *>...<unsigned short int><char[]><char>
//
//         <unsigned int><unsigned int>...<unsigned short int><char[]><char>
// The pointer to the object points to the beginning of the Soundslike string
// The Word pointers consist of the words which have the same 
//   soundslike pattern
//
// 2c) The Soundslike List
// This consists of Soundslike Objects back to back:
//  <Soundslike object 1><Soundslike object 2> ...
// There is no delimiter between the objects
//
//
//                          Format of the *.wrd files
//
// (This part is in ascii format)
// <"master_wl"><ws><lang name><ws><# words><ws>
//     <hash size><ws><size of list block><\n>
// (The rest is in binary format)
// <Wordlist>
// <Word Hash Table>
//
// The word hash table is a vector of unsigned ints, each of which contains
// the offset at which the word can be found in the word list.
//
//                          Format of the *.sl files
//
// (This part is in ascii format)
// <"master_wl"><ws><lang name><ws><# words><ws>
//     <hash size><ws><size of list block><\n>
// (The rest is in binary format)
// <Soundslike object list>
// <Soundslike Hash Table>
//
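// Soundslike object is laid out as follows:
//   <Num of Words><Word 1 offset>...<Soundslike><\0>
//   <unsigned short int><unsigned int>...<char[]><char>
// (Note: create() in this file writes the word offsets first and the
//  count immediately before the soundslike string.)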
// And like the .wrd file the hash table contains offsets not pointers.
//

#include <string>
#include <cstring>
#include <iostream>
#include <fstream>
#include <vector>

// POSIX headers
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#include "hash_string.hh"
#include "vector_hash-t.hh"
#include "block_vector.hh"
#include "data.hh"
#include "file_exceps.hh"
#include "file_util.hh"
#include "data_util.hh"
#include "language.hh"
#include "config.hh"

typedef unsigned int   u32int;
static const u32int u32int_max = (u32int)-1;
typedef unsigned short u16int;

namespace aspell_default_readonly_ws {

  using namespace aspell;
  using namespace autil;
  using namespace aspell::data_util;

  /////////////////////////////////////////////////////////////////////
  // 
  //  ReadOnlyWS
  //
    
  class ReadOnlyWS : public BasicWordSet
  {
      
  private:

    struct WordLookupParms {
      const char * block_begin;
      WordLookupParms() {}
      WordLookupParms(const char * b)  : block_begin(b) {}
      typedef BlockVector<const u32int> Vector;
      typedef u32int                    Value;
      typedef const char *              Key;
      Key key(Value v) const {return block_begin + v;}
      HashString<const char *> hash;
      bool equal(Key rhs, Key lhs) const {return strcmp(rhs,lhs) == 0;}
      bool is_nonexistent(Value v) const {return v == u32int_max;}
      void make_nonexistent(const Value & v) const {abort();}
    };
    typedef VectorHashTable<WordLookupParms> WordLookup;
    
    struct SoundslikeLookupParms {
      const char * block_begin;
      SoundslikeLookupParms() {}
      SoundslikeLookupParms(const char * b) : block_begin(b) {}
      typedef BlockVector<const u32int> Vector;
      typedef u32int                    Value;
      typedef const char *              Key;
      Key key(Value v) const {return block_begin + v;}
      HashString<const char *> hash;
      bool equal(Key rhs, Key lhs) const {return strcmp(rhs,lhs) == 0;}
      bool is_nonexistent(Value v) const {return v == u32int_max;}
      void make_nonexistent(const Value & v) const {abort();}
    };
    typedef VectorHashTable<SoundslikeLookupParms> SoundslikeLookup;
      
    char *           block;
    u32int           block_size;
    WordLookup       word_lookup;
    const char *     word_block;
    SoundslikeLookup soundslike_lookup;
    const char *     soundslike_block;
    
    string file_name_;

    ReadOnlyWS(const ReadOnlyWS&);
    ReadOnlyWS& operator= (const ReadOnlyWS&);

    struct ElementsParms;
    struct SoundslikeElementsParms;
    struct SoundslikeWordsParms;

  public:
    VirEmul * elements() const;
    Size      size()     const;
    bool      empty()    const;
      
    ReadOnlyWS() {
      block = 0;
    }

    ~ReadOnlyWS() {
      if (block)
	munmap(block, block_size);
    }
      
    void load(const string &, const Config *);
    const char * file_name() const;
    bool operator [] (const char *word) const;
    bool operator [] (const string &word) const;
      
    VirEmul * words_w_soundslike(const char * soundslike) const;
    VirEmul * words_w_soundslike(SoundslikeWord soundslike) const;

    VirSoundslikeEmul * soundslike_elements() const;
  };
    
  // Name of the file given to the most recent load(), as a C string.
  const char * ReadOnlyWS::file_name() const {
    const char * const path = file_name_.c_str();
    return path;
  }

  //
  //  
  //

  // Policy handed to MakeVirEmulation by elements(): walks the word hash
  // table and yields each stored word as a C string.
  struct ReadOnlyWS::ElementsParms {
    typedef WordLookup::const_iterator Iterator; 
    typedef const char *               Value;

    // Base address the offsets stored in the table are added to.
    const char * word_block_begin;

    ElementsParms(const char * base)
      : word_block_begin(base)
    {}

    // Reports whether the iterator is at its end.
    bool endf(const Iterator & pos) const {
      return pos.at_end();
    }

    // Value for the end state: a null pointer.
    Value end_state() const {
      return 0;
    }

    // Converts the offset the iterator refers to into a word pointer.
    Value deref(const Iterator & pos) const {
      return word_block_begin + *pos;
    }
  };

  // Returns a newly allocated enumeration over every word in the set.
  ReadOnlyWS::VirEmul * ReadOnlyWS::elements() const {
    ElementsParms parms(block);
    return new MakeVirEmulation<ElementsParms>(word_lookup.begin(), parms);
  }

  // Size of the set as reported by the word hash table.
  ReadOnlyWS::Size ReadOnlyWS::size() const {
    const Size total = word_lookup.size();
    return total;
  }
  
  // True when the word hash table reports itself as empty.
  bool ReadOnlyWS::empty() const {
    const bool none = word_lookup.empty();
    return none;
  }

  // Fixed-size binary header at the very start of the dictionary file.
  // create() writes it and load() reads it back as raw bytes, so the
  // order and width of the fields are part of the file format.
  struct DataHead {
    // all sizes except the last two must be divisible by
    // sysconf(_SC_PAGESIZE);

    // Format tag, "aspell rowl 1.1" including its terminating NUL.
    char check_word[16];
    // Bytes taken by this struct plus the language name and soundslike
    // version strings, rounded up; load() uses it as the mmap offset.
    u32int head_size;
    // Sum of the four region sizes below; length of the mapping.
    u32int total_block_size;
    // Size of the word list region.
    u32int word_block_size;
    // Number of words in the word hash table.
    u32int word_count;
    // Number of buckets in the word hash table.
    u32int word_buckets;
    // Size of the word hash table region.
    u32int word_size;
    // Size of the soundslike object region.
    u32int soundslike_block_size;
    // Number of entries in the soundslike hash table.
    u32int soundslike_count;
    // Number of buckets in the soundslike hash table.
    u32int soundslike_buckets;
    // Size of the soundslike hash table region.
    u32int soundslike_size;
    // Length of the language name stored after the header, NUL included.
    u32int lang_name_size;
    // Length of the soundslike version string, NUL included.
    u32int soundslike_version_size;
  };

  // Reads exactly `size` bytes from `fd` into `buf`, retrying after short
  // reads.  Returns false on end-of-file or on a read error.
  static bool read_fully(int fd, void * buf, size_t size) {
    char * out = static_cast<char *>(buf);
    while (size != 0) {
      const ssize_t got = read(fd, out, size);
      if (got <= 0) return false;
      out  += got;
      size -= static_cast<size_t>(got);
    }
    return true;
  }

  // Loads the word set stored in the file `b`.
  //
  // The header is read and validated, the language and soundslike
  // version recorded in the file are checked, and the data area is then
  // mapped read-only and wired into the two hash tables.
  //
  // Throws CantReadFile when the file can not be opened or mapped and
  // BadFileFormat when the header is truncated, carries the wrong check
  // word, has implausible string sizes or a different soundslike
  // version.  Whatever set_check_lang() reports is re-thrown with the
  // file name attached.  The members describing the mapping are only
  // modified once the new mapping exists, so a failed load() leaves any
  // previously loaded data untouched.
  void ReadOnlyWS::load(const string & b,
                        const Config * config)
  {
    file_name_ = b;

    // Closes the descriptor on every exit path, including exceptions.
    struct FdGuard {
      int fd;
      explicit FdGuard(int f) : fd(f) {}
      ~FdGuard() {if (fd != -1) close(fd);}
    };

    FdGuard file(open(file_name_.c_str(), O_RDONLY));
    if (file.fd == -1) 
      throw CantReadFile(file_name_);

    DataHead data_head;
    if (!read_fully(file.fd, &data_head, sizeof(DataHead)))
      throw BadFileFormat(file_name_);

    // Compare all 16 bytes (the text plus its terminating NUL) so that a
    // header without a NUL can not make the comparison run off the array.
    if (memcmp(data_head.check_word, "aspell rowl 1.1",
               sizeof(data_head.check_word)) != 0)
      throw BadFileFormat(file_name_);

    // Both strings are stored between the fixed header and the mapped
    // data, so each must be shorter than the header area.  This also
    // keeps a damaged header from requesting a huge buffer.
    if (data_head.lang_name_size >= data_head.head_size
        || data_head.soundslike_version_size >= data_head.head_size)
      throw BadFileFormat(file_name_);

    {
      // One extra zero byte guarantees termination whatever the file holds.
      std::vector<char> name
        (static_cast<size_t>(data_head.lang_name_size) + 1, '\0');
      if (!read_fully(file.fd, &name[0], data_head.lang_name_size))
        throw BadFileFormat(file_name_);

      try {
        set_check_lang(&name[0], config);
      } catch (RethrowWFile & e) {
        e.rethrow_w_file(file_name_);
      }
    }

    {
      std::vector<char> version
        (static_cast<size_t>(data_head.soundslike_version_size) + 1, '\0');
      if (!read_fully(file.fd, &version[0],
                      data_head.soundslike_version_size))
        throw BadFileFormat(file_name_);

      if (strcmp(&version[0], lang()->soundslike_version()) != 0)
        throw BadFileFormat(file_name_, "Wrong Soundslike Version");
    }

    void * mapped = mmap(NULL, data_head.total_block_size, PROT_READ,
                         MAP_SHARED, file.fd, data_head.head_size);
    if (mapped == MAP_FAILED)
      throw CantReadFile(file_name_);

    // Nothing below throws, so it is now safe to drop a mapping left
    // over from an earlier load() and switch to the new one.
    if (block)
      munmap(block, block_size);
    block      = static_cast<char *>(mapped);
    block_size = data_head.total_block_size;

    // Layout of the mapping:
    //   word list | word hash table | soundslike objects | soundslike hash
    word_block       = block;
    soundslike_block = block + data_head.word_block_size + data_head.word_size;

    word_lookup.parms().block_begin = word_block;
    const u32int * begin = reinterpret_cast<const u32int *>
      (word_block + data_head.word_block_size);
    word_lookup.vector().set(begin, begin + data_head.word_buckets);
    word_lookup.set_size(data_head.word_count);

    soundslike_lookup.parms().block_begin = soundslike_block;
    begin = reinterpret_cast<const u32int *>
      (soundslike_block + data_head.soundslike_block_size);
    soundslike_lookup.vector().set(begin,
                                   begin + data_head.soundslike_buckets);
    soundslike_lookup.set_size(data_head.soundslike_count);

    // The descriptor is closed by FdGuard; the mapping stays valid.
  }

  // True when `word` is present in the word hash table.
  bool ReadOnlyWS::operator [] (const char * word) const {
    return word_lookup.count(word) != 0;
  }

  // String overload: true when `word` is present in the word hash table.
  bool ReadOnlyWS::operator [] (const string & word) const {
    const char * const text = word.c_str();
    return word_lookup.count(text) != 0;
  }

  // Policy handed to MakeVirEmulation by words_w_soundslike(): walks an
  // array of word offsets and yields each word as a C string.
  struct ReadOnlyWS::SoundslikeWordsParms {
    typedef const u32int *             Iterator;
    typedef const char *               Value;

    // Base address the offsets are added to.
    const char * word_block_begin;
    // One past the last offset of the array being walked.
    Iterator     end;

    SoundslikeWordsParms(const char * base, Iterator stop) 
      : word_block_begin(base), end(stop)
    {}

    // True once the iterator has reached `end`.
    bool endf(Iterator pos) const {
      return pos == end;
    }

    // Value for the end state: a null pointer.
    Value end_state() const {
      return 0;
    }

    // Converts the offset the iterator points at into a word pointer.
    Value deref(Iterator pos) const {
      return word_block_begin + *pos;
    }
  };

  // Policy handed to MakeVirEmulation by soundslike_elements(): walks the
  // soundslike hash table and yields a SoundslikeWord per entry.
  struct ReadOnlyWS::SoundslikeElementsParms {
    typedef SoundslikeLookup::const_iterator Iterator;
    typedef SoundslikeWord                   Value;

    // Base address the offsets stored in the table are added to.
    const char * soundslike_block_begin;
      
    SoundslikeElementsParms(const char * base) 
      : soundslike_block_begin(base)
    {}
      
    // Reports whether the iterator is at its end.
    bool endf(Iterator pos) const {
      return pos.at_end();
    }

    // Value for the end state: a SoundslikeWord built from two zeros.
    Value end_state() {
      return Value(0,0);
    }

    // Builds a SoundslikeWord from the address of the soundslike string
    // the iterator refers to; the second argument is always 0.
    Value deref(Iterator pos) {
      const char * const text = soundslike_block_begin + *pos;
      return Value(text, 0);
    }
  };

  // Returns a newly allocated enumeration over every soundslike entry.
  ReadOnlyWS::VirSoundslikeEmul * ReadOnlyWS::soundslike_elements() const {
    SoundslikeElementsParms parms(soundslike_block);
    return new MakeVirEmulation<SoundslikeElementsParms>
      (soundslike_lookup.begin(), parms);
  }

  // Returns a newly allocated enumeration of the words stored under the
  // given soundslike string.
  ReadOnlyWS::VirEmul * 
  ReadOnlyWS::words_w_soundslike(const char * soundslike) const {
    SoundslikeLookup::const_iterator hit = soundslike_lookup.find(soundslike);

    // Not in the table: hand back an always-at-end enumeration.
    if (hit == soundslike_lookup.end())
      return new MakeAlwaysEndEmulation<const char *>();

    // Found: continue with the overload that takes the located object.
    SoundslikeWord found(soundslike_block + *hit, 0);
    return ReadOnlyWS::words_w_soundslike(found);
  }

  // Returns a newly allocated enumeration of the words attached to the
  // soundslike object `w`.
  //
  // The two bytes just before the soundslike string hold the word count,
  // and the word offsets sit directly in front of that count.
  ReadOnlyWS::VirEmul *
  ReadOnlyWS::words_w_soundslike(SoundslikeWord w) const {
    const u32int * offsets_end
      = reinterpret_cast<const u32int *>(w.soundslike - 2);
    const u16int word_count
      = *reinterpret_cast<const u16int *>(offsets_end);
    const u32int * offsets_begin = offsets_end - word_count;

    SoundslikeWordsParms parms(word_block, offsets_end);
    return new MakeVirEmulation<SoundslikeWordsParms>(offsets_begin, parms);
  }

}  

namespace aspell {

  // Factory: allocates an empty ReadOnlyWS and returns it through the
  // BasicWordSet interface.
  BasicWordSet * new_default_readonly_word_set() {
    BasicWordSet * word_set = new aspell_default_readonly_ws::ReadOnlyWS();
    return word_set;
  }
  
}

namespace aspell_default_readonly_ws {

  using namespace aspell;
  using namespace autil;

  struct WordLookupParms {
    typedef vector<string> Vector;
    typedef string         Value;
    typedef string         Key;
    const Key & key(const Value & v) const {return v;}
    HashString<string>     hash;
    bool equal(const Key & rhs, const Key & lhs) const {return rhs == lhs;}
    bool is_nonexistent(const Value & v) const {return v.empty();}
    void make_nonexistent(Value & v) const {v.resize(0);}
  };

  typedef VectorHashTable<WordLookupParms> WordHash;

  struct SoundslikeLookupParms {
    typedef string                      Key;
    typedef pair<Key, vector<u32int> >  Value;
    typedef vector<Value>               Vector;
    const Key & key(const Value & v) const {return v.first;}
    HashString<string>  hash;
    bool equal(const Key & rhs, const Key & lhs) const {return rhs == lhs;}
    bool is_nonexistent(const Value & v) const {return v.first.empty();}
    void make_nonexistent(Value & v) const {
      v.first.resize(0); 
      v.second.resize(0);
    }
  };

  typedef VectorHashTable<SoundslikeLookupParms> SoundHash;

  // Rounds `i` up to the nearest multiple of `size`.
  //
  // The remainder form is used instead of ((i + size - 1) / size) * size
  // because that sum wraps around for values of `i` near the top of the
  // unsigned range and then yields a result that is too small, even when
  // the correct multiple is representable.  A `size` of 0 has no
  // multiples; `i` is returned unchanged in that case rather than
  // dividing by zero.
  static inline unsigned int round_up(unsigned int i, unsigned int size) {
    if (size == 0) return i;
    const unsigned int excess = i % size;
    return excess == 0 ? i : i + (size - excess);
  }
  
  // Builds the read-only dictionary file `base` from the words produced
  // by `els`.
  //
  // The file is written as: a page-rounded header area (DataHead, the
  // language name, the soundslike version), the word list, the word hash
  // table, the soundslike objects and the soundslike hash table.  Each of
  // the four data regions is rounded up to a multiple of the page size.
  // Both hash tables store offsets relative to the start of their list
  // region, with u32int_max in unused buckets.
  //
  // `els` is deleted once all its words have been read.
  //
  // NOTE(review): neither the result of opening the stream nor of any
  // write is checked here.
  void create (const string & base, 
	       VirEmulation<const char *> * els,
	       const Language & lang) 
  {
    size_t page_size = (size_t) sysconf (_SC_PAGESIZE);

    // The file format below hard-codes 2 and 4 byte integers.
    assert(sizeof(u16int) == 2);
    assert(sizeof(u32int) == 4);

    // NOTE(review): ios::bin is not the ISO spelling (ios::binary) --
    // confirm against the toolchain in use.
    ofstream OUT;
    OUT.open(base.c_str(), ios::out | ios::bin);

    DataHead data_head;
    memset(&data_head, 0, sizeof(data_head));
    strcpy(data_head.check_word, "aspell rowl 1.1");

    // Header area: the struct plus both NUL-terminated strings, rounded
    // up to a whole number of pages.
    data_head.lang_name_size          = strlen(lang.name()) + 1;
    data_head.soundslike_version_size = strlen(lang.soundslike_version()) + 1;
    data_head.head_size  = sizeof(DataHead);
    data_head.head_size += data_head.lang_name_size;
    data_head.head_size += data_head.soundslike_version_size;
    data_head.head_size  = round_up(data_head.head_size, page_size);

    // Maps each soundslike string to the offsets of the words sharing it;
    // filled while the word list is written and emitted afterwards.
    SoundHash sound_prehash;
    {
      // Collect every word; next() returning 0 ends the input.
      WordHash word_hash;
      const char * w;
      while ( (w = els->next()) != 0)
	word_hash.insert(w);
      delete els;
      
      // Resize the table based on the word count divided by .8.
      word_hash.resize(word_hash.size()/.8);
      
      // The word list starts right after the header area; offsets are
      // measured from `start`.
      OUT.seekp(data_head.head_size);
      u32int start = data_head.head_size;
      
      // On-disk hash table, indexed like word_hash.vector().
      // NOTE(review): assumes vector().size() == bucket_count().
      vector<u32int> final_hash(word_hash.bucket_count(), u32int_max);
      
      // A single NUL byte precedes the first word, so the first word is
      // written at offset 1.
      OUT.put('\0');
      for (unsigned int i = 0; i != word_hash.vector().size(); ++i) {
	const string & value = word_hash.vector()[i];
	
	if (word_hash.parms().is_nonexistent(value)) continue;

	// Record where the word lands, then write it with its terminator.
	final_hash[i] = OUT.tellp() - start;
	OUT << value << '\0';

	// Add the word's offset to the list kept for its soundslike.
	// NOTE(review): relies on insert() returning the entry for the
	// key whether or not it was already present -- confirm.
	SoundHash::value_type to_insert;
	to_insert.first = lang.to_soundslike(value);
	sound_prehash.insert(to_insert).first->second.push_back(final_hash[i]);
      }

      data_head.word_block_size = round_up(OUT.tellp() - start + 1, 
					   page_size);
      data_head.total_block_size = data_head.word_block_size;

      // The word hash table follows the (padded) word list.
      OUT.seekp(data_head.head_size + data_head.total_block_size);
      OUT.write(reinterpret_cast<const char *>(&final_hash.front()),
		final_hash.size() * 4);

      data_head.word_count   = word_hash.size();
      data_head.word_buckets = word_hash.bucket_count();
      data_head.word_size    
	= round_up(word_hash.bucket_count() * 4, page_size);
      data_head.total_block_size += data_head.word_size;
      // Position at the start of the soundslike object region.
      OUT.seekp(data_head.head_size + data_head.total_block_size);
    }

    sound_prehash.resize(sound_prehash.size()/.8);

    {
      // On-disk soundslike hash table, indexed like sound_prehash.vector().
      vector<u32int> final_hash(sound_prehash.bucket_count(), u32int_max);

      u32int start = OUT.tellp();

      for (unsigned int i = 0; i != sound_prehash.vector().size(); ++i) {
	const SoundHash::value_type & value = sound_prehash.vector()[i];

	if (sound_prehash.parms().is_nonexistent(value)) continue;

	// NOTE(review): the count is stored in 16 bits; more than 65535
	// words under one soundslike would be truncated silently.
	u16int count = value.second.size();

	// Object layout: word offsets, 2-byte count, soundslike string, NUL.
	OUT.write(reinterpret_cast<const char *>(&value.second.front()),
		  count * 4);

	OUT.write(reinterpret_cast<char *>(&count),2);

	// The hash table points at the soundslike string, not at the
	// start of the object.
	final_hash[i] = OUT.tellp() - start;

	OUT << value.first << '\0';

	// Start the next object at a file position that is a multiple of 4.
	OUT.seekp(round_up(OUT.tellp(), 4));
      }
      data_head.soundslike_block_size 
	= round_up(OUT.tellp() - start, page_size);
      data_head.total_block_size += data_head.soundslike_block_size;

      // The soundslike hash table follows the (padded) object region.
      OUT.seekp(data_head.head_size + data_head.total_block_size);
      OUT.write(reinterpret_cast<char *>(&final_hash.front()),
		final_hash.size() * 4);

      data_head.soundslike_count   = sound_prehash.size();
      data_head.soundslike_buckets = sound_prehash.bucket_count();
      data_head.soundslike_size    
	= round_up(final_hash.size() * 4, page_size);
      data_head.total_block_size += data_head.soundslike_size;
    
    }

    // write data head to file
    // (done last because the sizes are only known at this point)
    OUT.seekp(0);
    OUT.write((char *)&data_head, sizeof(DataHead));
    OUT.write(lang.name(), data_head.lang_name_size);
    OUT.write(lang.soundslike_version(), data_head.soundslike_version_size);
  }

}

namespace aspell {
  // Builds the read-only dictionary at the path configured under
  // "master-path", using the language described by `config` and the
  // words produced by `els`.
  void create_default_readonly_word_set(VirEmulation<const char *> * els,
                                        const Config & config)
  {
    Language language(config);
    aspell_default_readonly_ws::create
      (config.retrieve("master-path"), els, language);
  }
}
