Newer
Older
minerva / Userland / Libraries / LibLocale / Segmenter.cpp
@minerva minerva on 13 Jul 5 KB Initial commit
/*
 * Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/BinarySearch.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <LibLocale/Locale.h>
#include <LibLocale/Segmenter.h>
#include <LibUnicode/Segmentation.h>

namespace Locale {

SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
{
    if (segmenter_granularity == "grapheme"sv)
        return SegmenterGranularity::Grapheme;
    if (segmenter_granularity == "sentence"sv)
        return SegmenterGranularity::Sentence;
    if (segmenter_granularity == "word"sv)
        return SegmenterGranularity::Word;
    VERIFY_NOT_REACHED();
}

StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
{
    switch (segmenter_granularity) {
    case SegmenterGranularity::Grapheme:
        return "grapheme"sv;
    case SegmenterGranularity::Sentence:
        return "sentence"sv;
    case SegmenterGranularity::Word:
        return "word"sv;
    }
    VERIFY_NOT_REACHED();
}

class SegmenterImpl : public Segmenter {
public:
    SegmenterImpl(SegmenterGranularity segmenter_granularity)
        : Segmenter(segmenter_granularity)
    {
    }

    virtual ~SegmenterImpl() override = default;

    virtual NonnullOwnPtr<Segmenter> clone() const override
    {
        return make<SegmenterImpl>(m_segmenter_granularity);
    }

    virtual void set_segmented_text(String text) override
    {
        m_string_storage = move(text);
        set_text(m_string_storage.code_points());
    }

    virtual void set_segmented_text(Utf16View const& text) override
    {
        set_text(text);
    }

    void set_segmented_text(Utf32View const& text)
    {
        set_text(text);
    }

    virtual size_t current_boundary() override
    {
        return m_current_boundary;
    }

    virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
    {
        recompute_boundaries_if_necessary();

        if (inclusive == Inclusive::Yes)
            ++boundary;

        auto lower_bound_index = strict_lower_bound(m_boundaries.span(), boundary);

        if (!lower_bound_index.has_value())
            return OptionalNone {};
        m_current_boundary = m_boundaries[*lower_bound_index];
        return m_current_boundary;
    }

    virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
    {
        recompute_boundaries_if_necessary();

        if (inclusive == Inclusive::Yes)
            --boundary;

        auto upper_bound_index = upper_bound(m_boundaries.span(), boundary);

        if (!upper_bound_index.has_value())
            return OptionalNone {};
        m_current_boundary = m_boundaries[*upper_bound_index];
        return m_current_boundary;
    }

    virtual void for_each_boundary(String text, SegmentationCallback callback) override
    {
        for_each_segmentation_boundary(text.code_points(), move(callback));
    }

    virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
    {
        for_each_segmentation_boundary(text, move(callback));
    }

    virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
    {
        for_each_segmentation_boundary(text, move(callback));
    }

    virtual bool is_current_boundary_word_like() const override
    {
        // FIXME: Implement one day.
        return false;
    }

private:
    void set_text(Variant<Utf8View, Utf16View, Utf32View> text)
    {
        m_segmented_text = text;
        m_must_recompute_boundaries = true;
    }

    void recompute_boundaries_if_necessary()
    {
        if (!m_must_recompute_boundaries)
            return;

        m_boundaries.clear();
        auto callback = [&](size_t boundary) {
            m_boundaries.append(boundary);
            return IterationDecision::Continue;
        };
        m_segmented_text.visit([&](auto const& text) { return for_each_segmentation_boundary(text, move(callback)); });
        m_must_recompute_boundaries = false;
    }

    template<class T>
    void for_each_segmentation_boundary(T const& text, SegmentationCallback callback)
    {
        switch (segmenter_granularity()) {
        case SegmenterGranularity::Grapheme:
            Unicode::for_each_grapheme_segmentation_boundary(text, move(callback));
            break;
        case SegmenterGranularity::Sentence:
            Unicode::for_each_sentence_segmentation_boundary(text, move(callback));
            break;
        case SegmenterGranularity::Word:
            Unicode::for_each_word_segmentation_boundary(text, move(callback));
            break;
        }
    }

    bool m_must_recompute_boundaries { true };
    Vector<size_t> m_boundaries;
    size_t m_current_boundary { 0 };
    String m_string_storage;
    Variant<Utf8View, Utf16View, Utf32View> m_segmented_text { Utf8View {} };
};

NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
{
    return Segmenter::create(default_locale(), segmenter_granularity);
}

NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
{
    // FIXME: Implement locale-specific segmentation.
    (void)locale;
    return make<SegmenterImpl>(segmenter_granularity);
}

}