"id";"title";"slug";"acronym";"url";"organization";"organization_id";"description";"frequency";"license";"temporal_coverage.start";"temporal_coverage.end";"spatial.granularity";"spatial.zones";"private";"featured";"created_at";"last_modified";"tags";"archived";"resources_count";"harvest.backend";"harvest.domain";"harvest.created_at";"harvest.modified_at";"quality_score";"metric.discussions";"metric.reuses";"metric.followers";"metric.views" "64ee072ff1b5a534ce7a4ed3";"A n-grams collection extracted from the Portuguese Web";"a-n-grams-collection-extracted-from-the-portuguese-web";"";"https://dados.gov.pt/pt/datasets/a-n-grams-collection-extracted-from-the-portuguese-web/";"Arquivo.pt - pesquise páginas do passado";"6087fbf7454ae34be8d6ce4e";"The n-grams collection was extracted from the collected documents whose identified language was Portuguese. We extracted word n-grams up to the fifht order (5-grams). A set of regular expressions to tokenize the text were applied. After the extraction, all n-grams with tokens having more than 32 characters were discarded. N-grams with frequencies below 5 were discarded as well. The n-grams collection is available as a set of UTF-8 encoded files, containing the n-grams and their frequencies (2010-11-10). This collection was build by David Batista, winner of the [2nd place of the Arquivo.pt award 2021](https://arquivo.pt/winners2021) with the work [Politiquices.pt](https://www.politiquices.pt/). Related publication: https://www.davidsbatista.net/assets/documents/publications/WPT05_fala2010.pdf Also published at [Harvard Dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ZSXC55)";"punctual";"Creative Commons CCZero";"1996-01-01";"2022-12-02";"country";"Portugal";False;False;"2023-08-29T15:56:47.976000";"2023-08-29T16:12:02.062000";"n-grams-portuguese";False;1;"";"";"";"";"1.00";0;0;0;0