/** @file
 * @brief MatchSpy implementation.
 */
/* Copyright (C) 2007-2024 Olly Betts
 * Copyright (C) 2007,2009 Lemur Consulting Ltd
 * Copyright (C) 2010 Richard Boulton
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see
 * <https://www.gnu.org/licenses/>.
 */

#include <config.h>
#include <xapian/matchspy.h>

#include <xapian/document.h>
#include <xapian/error.h>
#include <xapian/queryparser.h>
#include <xapian/registry.h>

#include <map>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

#include "debuglog.h"
#include "heap.h"
#include "omassert.h"
#include "pack.h"
#include "stringutils.h"
#include "str.h"
#include "termlist.h"

using namespace std;
using namespace Xapian;
using Xapian::Internal::intrusive_ptr;

/// Out-of-line destructor definition; it has no work of its own to do.
MatchSpy::~MatchSpy() = default;

/** Base-class clone(): always reports that the method is unimplemented.
 *
 *  @exception Xapian::UnimplementedError is always thrown.
 */
MatchSpy*
MatchSpy::clone() const
{
    const char* const msg = "MatchSpy not suitable for use with remote searches - clone() method unimplemented";
    throw Xapian::UnimplementedError(msg);
}

/** Base-class name(): always reports that the method is unimplemented.
 *
 *  @exception Xapian::UnimplementedError is always thrown.
 */
string
MatchSpy::name() const
{
    const char* const msg = "MatchSpy not suitable for use with remote searches - name() method unimplemented";
    throw Xapian::UnimplementedError(msg);
}

/** Base-class serialise(): always reports that the method is unimplemented.
 *
 *  @exception Xapian::UnimplementedError is always thrown.
 */
string
MatchSpy::serialise() const
{
    const char* const msg = "MatchSpy not suitable for use with remote searches - serialise() method unimplemented";
    throw Xapian::UnimplementedError(msg);
}

/** Base-class unserialise(): always reports that the method is unimplemented.
 *
 *  Both arguments are ignored.
 *
 *  @exception Xapian::UnimplementedError is always thrown.
 */
MatchSpy*
MatchSpy::unserialise(const string&, const Registry&) const
{
    const char* const msg = "MatchSpy not suitable for use with remote searches - unserialise() method unimplemented";
    throw Xapian::UnimplementedError(msg);
}

/** Base-class serialise_results(): always reports that the method is
 *  unimplemented.
 *
 *  @exception Xapian::UnimplementedError is always thrown.
 */
string
MatchSpy::serialise_results() const
{
    const char* const msg = "MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented";
    throw Xapian::UnimplementedError(msg);
}

void
MatchSpy::merge_results(const string &) {
    throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
}

/// Return a fixed, generic description string for a MatchSpy.
string
MatchSpy::get_description() const
{
    string description("Xapian::MatchSpy()");
    return description;
}

[[noreturn]]
static void unsupported_method() {
    throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
}

/** TermList which walks the values recorded by a ValueCountMatchSpy.
 *
 *  Entries are visited in the order of the spy's values map; each entry's key
 *  is exposed as the current term and its mapped count as the term frequency.
 */
class ValueCountTermList final : public TermList {
    /// Current position within spy->values.
    map<string, Xapian::doccount>::const_iterator it;

    /// Becomes true once next() or skip_to() has been called.
    bool started;

    /// Reference to the spy internals whose values map is being iterated.
    intrusive_ptr<Xapian::ValueCountMatchSpy::Internal> spy;

  public:
    /// Start positioned before the first entry of spy_->values.
    explicit ValueCountTermList(ValueCountMatchSpy::Internal* spy_)
	: it(spy_->values.begin()), started(false), spy(spy_)
    {
    }

    /// Return the count stored against the current entry.
    Xapian::doccount get_termfreq() const {
	Assert(started);
	Assert(it != spy->values.end());
	return it->second;
    }

    /** Move to the next entry (or onto the first entry on the first call).
     *
     *  @return this once the end of the map is reached, otherwise a null
     *	    pointer with current_term updated.
     */
    TermList* next() {
	if (started) {
	    Assert(it != spy->values.end());
	    ++it;
	} else {
	    // The first call just "arrives" at the entry we already point to.
	    started = true;
	}
	if (it != spy->values.end()) {
	    current_term = it->first;
	    return nullptr;
	}
	return this;
    }

    /** Advance until the current key no longer compares less than @a term.
     *
     *  @return this once the end of the map is reached, otherwise a null
     *	    pointer with current_term updated.
     */
    TermList* skip_to(string_view term) {
	while (it != spy->values.end()) {
	    if (!(it->first < term)) break;
	    ++it;
	}
	started = true;
	if (it != spy->values.end()) {
	    current_term = it->first;
	    return nullptr;
	}
	return this;
    }

    // None of the following are supported here: each one just calls
    // unsupported_method().

    Xapian::termcount get_approx_size() const {
	unsupported_method();
    }

    Xapian::termcount get_wdf() const {
	unsupported_method();
    }

    PositionList* positionlist_begin() const {
	unsupported_method();
    }

    Xapian::termcount positionlist_count() const {
	unsupported_method();
    }
};

/** A string with a corresponding frequency.
 *
 *  Simple immutable pairing, used as the element type when ranking values
 *  by how often they occur.
 */
class StringAndFrequency {
    /// The string.
    std::string str;

    /// The frequency associated with the string.
    Xapian::doccount frequency;

  public:
    /** Construct a StringAndFrequency object.
     *
     *  @param str_	    The string (copied).
     *  @param frequency_   The frequency to associate with it.
     */
    StringAndFrequency(const std::string & str_, Xapian::doccount frequency_)
	    : str(str_), frequency(frequency_) {}

    /** Return the string.
     *
     *  Returned by reference so that comparing or inspecting an element
     *  doesn't copy the string each time.  The reference is to this object's
     *  member, so it is only usable while this object is alive and has not
     *  been assigned to.
     */
    const std::string& get_string() const { return str; }

    /// Return the frequency.
    Xapian::doccount get_frequency() const { return frequency; }
};

/** Compare two StringAndFrequency objects.
 *
 *  The comparison is firstly by frequency (higher is better), then by string
 *  (earlier lexicographic sort is better).
 */
class StringAndFreqCmpByFreq {
  public:
    /// Default constructor
    StringAndFreqCmpByFreq() {}

    /// Return true if a has a higher frequency than b.
    /// If equal, compare by the str, to provide a stable sort order.
    bool operator()(const StringAndFrequency &a,
		    const StringAndFrequency &b) const {
	if (a.get_frequency() > b.get_frequency()) return true;
	if (a.get_frequency() < b.get_frequency()) return false;
	return a.get_string() < b.get_string();
    }
};

/** A termlist iterator over a vector of StringAndFrequency objects.
 *
 *  The owner fills in @a values and then calls init(); after that the entries
 *  are visited in the order they appear in the vector.
 */
class StringAndFreqTermList final : public TermList {
  private:
    /// Current position within values (only meaningful after init()).
    vector<StringAndFrequency>::const_iterator it;

    /// Becomes true once next() or skip_to() has been called.
    bool started;

  public:
    /// The entries to iterate over.
    vector<StringAndFrequency> values;

    /** init should be called after the values have been set, but before
     *  iteration begins.
     */
    void init() {
	it = values.begin();
	started = false;
    }

    /// Return the frequency of the current entry.
    Xapian::doccount get_termfreq() const {
	Assert(started);
	Assert(it != values.end());
	return it->get_frequency();
    }

    /** Move to the next entry (or onto the first entry on the first call).
     *
     *  @return this once the end of the vector is reached, otherwise NULL
     *	    with current_term updated.
     */
    TermList * next() {
	if (!started) {
	    started = true;
	} else {
	    Assert(it != values.end());
	    ++it;
	}
	if (it == values.end()) {
	    return this;
	}
	current_term = it->get_string();
	return NULL;
    }

    /** Advance until the current entry's string no longer compares less
     *  than @a term.
     *
     *  @return this once the end of the vector is reached (the same
     *	    end-of-list result that next() and
     *	    ValueCountTermList::skip_to() give), otherwise NULL with
     *	    current_term updated.
     */
    TermList* skip_to(string_view term) {
	while (it != values.end() && it->get_string() < term) {
	    ++it;
	}
	started = true;
	if (it == values.end()) {
	    // Running off the end must be reported, otherwise the caller
	    // sees a "success" result with a stale current_term and may then
	    // read through the end iterator.
	    return this;
	}
	current_term = it->get_string();
	return NULL;
    }

    // None of the following are supported here: each one just calls
    // unsupported_method().
    Xapian::termcount get_approx_size() const { unsupported_method(); }
    Xapian::termcount get_wdf() const { unsupported_method(); }
    PositionList* positionlist_begin() const { unsupported_method(); }
    Xapian::termcount positionlist_count() const { unsupported_method(); }
};

/** Get the most frequent items from a map from string to frequency.
 *
 *  This takes input such as that in ValueCountMatchSpy::Internal::values and
 *  returns a vector of the most frequent items in the input.
 *
 *  Up to @a maxitems entries are collected directly; if there are more input
 *  items than that, the collected entries are kept as a heap whose tip is the
 *  worst entry kept so far, and each further item replaces the tip only if
 *  it ranks better.
 *
 *  @param result A vector which will be filled with the most frequent
 *                items, in descending order of frequency.  Items with
 *                the same frequency will be sorted in ascending
 *                alphabetical order.  Any existing contents are discarded.
 *
 *  @param items The map from string to frequency, from which the most
 *               frequent items will be selected.
 *
 *  @param maxitems The maximum number of items to return (non-zero).
 */
static void
get_most_frequent_items(vector<StringAndFrequency> & result,
			const map<string, doccount> & items,
			size_t maxitems)
{
    Assert(maxitems != 0);
    result.clear();
    if (maxitems == 0) {
	// Defensive guard in case the assertion above is not active: with no
	// room for any item the loop below would index an empty vector.
	return;
    }

    // We can never hold more entries than there are input items, so don't
    // reserve the full limit when the caller passes a very large maxitems
    // (e.g. to mean "all of them") - that would over-allocate or even fail.
    const size_t available = items.size();
    result.reserve(maxitems < available ? maxitems : available);

    StringAndFreqCmpByFreq cmpfn;
    bool is_heap = false;

    for (const auto& item : items) {
	if (result.size() < maxitems) {
	    // Still filling up - just collect.
	    result.emplace_back(item.first, item.second);
	    continue;
	}

	// We have the desired number of items, so it's one-in one-out from
	// now on.
	Assert(result.size() == maxitems);
	if (!is_heap) {
	    // Built lazily so inputs with at most maxitems entries never pay
	    // for heap construction.
	    Heap::make(result.begin(), result.end(), cmpfn);
	    is_heap = true;
	}

	StringAndFrequency new_item(item.first, item.second);
	if (!cmpfn(new_item, result[0])) {
	    // The candidate is worse than the worst of the current top N.
	    continue;
	}

	// Overwrite the worst entry and restore the heap property.
	result[0] = std::move(new_item);
	Heap::replace(result.begin(), result.end(), cmpfn);
    }

    if (is_heap) {
	Heap::sort(result.begin(), result.end(), cmpfn);
    } else {
	sort(result.begin(), result.end(), cmpfn);
    }
}

/** Record one document: bump the total, and if the document has a non-empty
 *  value in the configured slot, bump the count for that value.
 *
 *  The second (double) argument is not used.
 */
void
ValueCountMatchSpy::operator()(const Document& doc, double)
{
    Assert(internal);
    ++internal->total;

    const string value(doc.get_value(internal->slot));
    if (value.empty()) {
	// Empty values aren't counted.
	return;
    }
    ++internal->values[value];
}

/// Return an iterator over the recorded values, backed by a new
/// ValueCountTermList built on this spy's internals.
TermIterator
ValueCountMatchSpy::values_begin() const
{
    Assert(internal);
    ValueCountTermList* termlist = new ValueCountTermList(internal.get());
    return Xapian::TermIterator(termlist);
}

/** Return an iterator over the most frequent values seen.
 *
 *  @param maxvalues	The maximum number of values to return.  If this is
 *			zero the iterator is constructed from a null
 *			termlist pointer.
 */
TermIterator
ValueCountMatchSpy::top_values_begin(size_t maxvalues) const
{
    Assert(internal);
    if (usual(maxvalues > 0)) {
	unique_ptr<StringAndFreqTermList> termlist(new StringAndFreqTermList);
	get_most_frequent_items(termlist->values, internal->values, maxvalues);
	termlist->init();
	// Ownership of the termlist passes to the iterator.
	return Xapian::TermIterator(termlist.release());
    }
    return Xapian::TermIterator(static_cast<StringAndFreqTermList*>(nullptr));
}

/// Return a newly allocated ValueCountMatchSpy for the same slot (the
/// counts gathered so far are not passed on).
MatchSpy*
ValueCountMatchSpy::clone() const
{
    Assert(internal);
    const auto slot = internal->slot;
    return new ValueCountMatchSpy(slot);
}

/// Return the name of this match spy class.
string
ValueCountMatchSpy::name() const
{
    string class_name("Xapian::ValueCountMatchSpy");
    return class_name;
}

/// Serialise this spy's parameters: just the slot number, packed with
/// pack_uint_last().
string
ValueCountMatchSpy::serialise() const
{
    Assert(internal);
    string packed;
    pack_uint_last(packed, internal->slot);
    return packed;
}

/** Create a new ValueCountMatchSpy from parameters produced by serialise().
 *
 *  @param s	The serialised parameters (a slot number readable with
 *		unpack_uint_last()).
 *
 *  The Registry argument is not used.  If the slot number can't be unpacked,
 *  unpack_throw_serialisation_error() is called.
 */
MatchSpy*
ValueCountMatchSpy::unserialise(const string& s, const Registry&) const
{
    const char* ptr = s.data();
    const char* const limit = ptr + s.size();

    valueno slot;
    const bool ok = unpack_uint_last(&ptr, limit, &slot);
    if (!ok) {
	unpack_throw_serialisation_error(ptr);
    }

    return new ValueCountMatchSpy(slot);
}

/** Serialise the results gathered so far.
 *
 *  The output is the total packed with pack_uint(), followed by one
 *  (pack_string() value, pack_uint() count) pair per entry in the values map.
 */
string
ValueCountMatchSpy::serialise_results() const
{
    LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", NO_ARGS);
    Assert(internal);

    string packed;
    pack_uint(packed, internal->total);
    for (const auto& entry : internal->values) {
	pack_string(packed, entry.first);
	pack_uint(packed, entry.second);
    }
    RETURN(packed);
}

void
ValueCountMatchSpy::merge_results(const string & s) {
    LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s);
    Assert(internal);
    const char * p = s.data();
    const char * end = p + s.size();

    Xapian::doccount n;
    if (!unpack_uint(&p, end, &n)) {
	unpack_throw_serialisation_error(p);
    }
    internal->total += n;

    string val;
    while (p != end) {
	doccount freq;
	if (!unpack_string(&p, end, val) ||
	    !unpack_uint(&p, end, &freq)) {
	    unpack_throw_serialisation_error(p);
	}
	internal->values[val] += freq;
    }
}

/** Return a human-readable description of this spy.
 *
 *  With internals present this reports the number of documents seen and the
 *  size of the values map; without them the parentheses are left empty.
 */
string
ValueCountMatchSpy::get_description() const
{
    string desc("ValueCountMatchSpy(");
    if (!internal) {
	desc += ")";
	return desc;
    }

    desc += str(internal->total);
    desc += " docs seen, looking in ";
    // NOTE(review): the figure labelled "slots" here is the number of
    // entries in internal->values - confirm the wording is as intended.
    desc += str(internal->values.size());
    desc += " slots)";
    return desc;
}
