Spaces:

cdleong
/

langcode-search

Sleeping

App Files Files Community

cdleong committed on Nov 18, 2021

Commit

54d3963

1 Parent(s): 741cd0d

minor fix on "Python"

Browse files

Files changed (1) hide show

app.py +47 -26

app.py CHANGED Viewed

@@ -5,15 +5,29 @@ import urllib
 import requests
 # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
 # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
 # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
 # TODO: add in vachan search even if lang not found
 # TODO: results from glottolog even if none from others
 things_to_test = [
   "knh", # deprecated code on ISO
   "khn", # only has 639-3 on ISO
   "xxx", # no such code on ISO or glottolog
   "Chinese", # Vachan struggles.
 ]
@@ -76,26 +90,26 @@ def main():
     return
   if langtext.lower() == "python":
-    st.success("[Python is the best language!(https://www.python.org/)")
     return
   # TODO: st.code() for these "lookup in progress" outputs.
-  st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
   if langcodes.tag_is_valid(langtext):
-    st.write(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
   else:
-    st.write(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
   try:
     lang = langcodes.Language.get(langtext)
   #  st.write(f"{lang} is the BCP-47 tag.")
     if "unknown" in lang.display_name().lower():
-      st.write(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
       lang = None
   except langcodes.LanguageTagError as e:
-    st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
     lang = None
@@ -104,16 +118,19 @@ def main():
     try:
       found = langcodes.find(langtext)
       lang = found
-      st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
     except LookupError as e:
-      st.write("## Result: failure!")
-      st.write(f"Unable to look up language code. But all hope is not lost...")
       st.write(f"* You can also try https://r12a.github.io/app-subtags/")
       st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
       lang = None
   #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
   if lang is not None:
@@ -121,12 +138,12 @@ def main():
     b_variant = lang.to_alpha3(variant='B')
     t_variant = lang.to_alpha3(variant='T')
     broader_tags = lang.broader_tags()
-    results_from_vachan = try_searching_vachan_engine(langtext)
     standardized_tag = langcodes.standardize_tag(lang)
-    languoid_id = try_retrieving_glottolog_id(langtext)
-    st.write(f"## Results: probably use '{standardized_tag}'")
     # TODO: make a results dictionary so it's easy to copy-paste?
     st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
     st.write(f"Breakdown of tag components:")
@@ -143,15 +160,8 @@ def main():
     st.write(f"### Language Subtag Search Tool")
     st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
-    st.write(f"### Glottolog")
-    if languoid_id:
-      st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
-    st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
-    if t_variant != b_variant:
-      st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
-    st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
     st.write("### Older / Related Codes")
     st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
@@ -172,11 +182,22 @@ def main():
     elif b_obsolete_codes:
       st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
       st.write(b_obsolete_codes)
-    if results_from_vachan:
-      st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
-      st.write(results_from_vachan)
 if __name__ == "__main__":

 import requests
 # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
+# Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
 # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
 # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
 # TODO: add in vachan search even if lang not found
 # TODO: results from glottolog even if none from others
+# TODO: check glottolog results to see if they find anything!
 things_to_test = [
   "knh", # deprecated code on ISO
   "khn", # only has 639-3 on ISO
   "xxx", # no such code on ISO or glottolog
   "Chinese", # Vachan struggles.
+  "Mandarin", # Vachan struggles.
+  "zh-CN",
+  "Chinese",
+  "zh-Latn-pinyin",
+  "en-Latn-US",
+  "en",
+  "English",
+  "fr-CA",
+  "French (Canada)",
+  "français",
+  "法语",
+  "", # empty string
 ]
     return
   if langtext.lower() == "python":
+    st.success("[Python is the best language!](https://www.python.org/)")
     return
   # TODO: st.code() for these "lookup in progress" outputs.
+  st.info("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
   if langcodes.tag_is_valid(langtext):
+    st.info(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
   else:
+    st.info(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
   try:
     lang = langcodes.Language.get(langtext)
   #  st.write(f"{lang} is the BCP-47 tag.")
     if "unknown" in lang.display_name().lower():
+      st.info(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
       lang = None
   except langcodes.LanguageTagError as e:
+    st.info(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
     lang = None
     try:
       found = langcodes.find(langtext)
       lang = found
+      st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
     except LookupError as e:
+      st.error("## Result: failure!")
+      st.error(f"Unable to look up language code. But all hope is not lost...")
       st.write(f"* You can also try https://r12a.github.io/app-subtags/")
       st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
       lang = None
+  t_variant = None
   #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
   if lang is not None:
     b_variant = lang.to_alpha3(variant='B')
     t_variant = lang.to_alpha3(variant='T')
     broader_tags = lang.broader_tags()
     standardized_tag = langcodes.standardize_tag(lang)
+    st.write(f"## BCP-47 Results: probably use '{standardized_tag}'")
     # TODO: make a results dictionary so it's easy to copy-paste?
     st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
     st.write(f"Breakdown of tag components:")
     st.write(f"### Language Subtag Search Tool")
     st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
     st.write("### Older / Related Codes")
     st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
     elif b_obsolete_codes:
       st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
       st.write(b_obsolete_codes)
+  st.write(f"### Glottolog")
+  languoid_id = try_retrieving_glottolog_id(langtext)
+  if languoid_id:
+    st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
+  # FIXME: fix this to display something if there's an ISO code to try
+  if t_variant:
+    st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
+    if t_variant != b_variant:
+      st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
+  st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
+  results_from_vachan = try_searching_vachan_engine(langtext)
+  if results_from_vachan:
+    st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
+    st.write(results_from_vachan)
 if __name__ == "__main__":